This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 2bb46241f Tika-4338 -- remove tagsoup entirely (#2011)
2bb46241f is described below
commit 2bb46241f13c7d3d63dd36f5bb88e910eee7ff8c
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 25 09:58:31 2024 -0400
Tika-4338 -- remove tagsoup entirely (#2011)
---
tika-bom/pom.xml | 10 --
tika-bundles/tika-bundle-standard/pom.xml | 2 +-
.../tika/sax/xpath/MatchingContentHandler.java | 3 +-
tika-eval/tika-eval-app/pom.xml | 2 +-
.../apache/tika/eval/app/SimpleComparerTest.java | 2 +-
tika-eval/tika-eval-core/pom.xml | 4 +-
.../tika/eval/core/util/ContentTagParser.java | 113 +++++++++++-
tika-parent/pom.xml | 6 -
.../tika-parser-code-module/pom.xml | 4 +-
.../apache/tika/parser/code/SourceCodeParser.java | 198 ++++++++++++++++-----
.../tika/parser/code/SourceCodeParserTest.java | 8 +-
.../tika/parser/microsoft/chm/ChmParser.java | 11 +-
tika-server/tika-server-eval/pom.xml | 2 +-
13 files changed, 279 insertions(+), 86 deletions(-)
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index a61b77bd3..bf5d99806 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -257,11 +257,6 @@
<artifactId>tika-parser-sqlite3-package</artifactId>
<version>3.0.1-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-tagsoup-package</artifactId>
- <version>3.0.0-SNAPSHOT</version>
- </dependency>
<!-- Tika parsers modules (extended package) -->
<dependency>
@@ -274,11 +269,6 @@
<artifactId>tika-parser-sqlite3-module</artifactId>
<version>3.0.1-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-tagsoup-module</artifactId>
- <version>3.0.0-SNAPSHOT</version>
- </dependency>
<!-- Tika parsers modules (ML package) -->
<dependency>
<groupId>org.apache.tika</groupId>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml
b/tika-bundles/tika-bundle-standard/pom.xml
index 1b0ea87f8..851cac644 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -174,7 +174,7 @@
jackcess|
jackcess-encrypt|
commons-lang3|
- tagsoup|
+ jsoup|
asm|
juniversalchardet|
vorbis-java-core|
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
index 9f96186aa..831611c06 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
@@ -69,8 +69,7 @@ public class MatchingContentHandler extends
ContentHandlerDecorator {
if (matcher.matchesElement()) {
super.endElement(uri, localName, name);
}
- // Sometimes tagsoup returns double end tags, so the stack might
- // be empty! TODO: Remove this when the tagsoup problem is fixed.
+ // this was originally added for tagsoup, but we need it generally
if (!matchers.isEmpty()) {
matcher = matchers.removeFirst();
}
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 7b80203f8..3d75fea31 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -101,7 +101,7 @@
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
- <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
+ <exclude>org.jsoup:jsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-databind:jar:</exclude>
diff --git
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
index 035869c9a..6b900bab3 100644
---
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
+++
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
@@ -279,7 +279,7 @@ public class SimpleComparerTest extends TikaTest {
List<Map<Cols, String>> tableInfosB =
WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
assertEquals(1, tableInfosB.size());
Map<Cols, String> tableInfoB = tableInfosB.get(0);
- //there actually is a tag problem, but tagsoup fixes it.
+ //there actually is a tag problem, but jsoup fixes it.
//this confirms behavior.
assertEquals("false", tableInfoB.get(Cols.TAGS_PARSE_EXCEPTION));
}
diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml
index 252eedc2e..60af7af1f 100644
--- a/tika-eval/tika-eval-core/pom.xml
+++ b/tika-eval/tika-eval-core/pom.xml
@@ -75,8 +75,8 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
</dependency>
<dependency>
diff --git
a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
index 1e3511d63..c3eecc252 100644
---
a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
+++
b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
@@ -20,14 +20,24 @@ package org.apache.tika.eval.core.util;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import javax.xml.XMLConstants;
-import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+import org.jsoup.select.NodeTraversor;
import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
@@ -53,11 +63,106 @@ public class ContentTagParser {
Map<String, Integer> tags = new HashMap<>();
XHTMLContentTagHandler xhtmlContentTagHandler =
new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags);
- SAXParserImpl.newInstance(null)
- .parse(new InputSource(new StringReader(html)), xhtmlContentTagHandler);
+ Document document = Jsoup.parse(html);
+ NodeTraversor.filter(new TikaNodeFilter(xhtmlContentTagHandler), document);
+
return new ContentTags(xhtmlContentTagHandler.toString(), tags);
}
+ private static class TikaNodeFilter implements NodeFilter {
+ boolean ignore = true;
+ ContentHandler handler;
+
+ private TikaNodeFilter(ContentHandler handler) {
+ this.handler = handler;
+ }
+
+ @Override
+ public NodeFilter.FilterResult head(Node node, int i) {
+ //skip document fragment
+ if ("html".equals(node.nodeName())) {
+ ignore = false;
+ }
+ if (ignore) {
+ return FilterResult.CONTINUE;
+ }
+ if (node instanceof TextNode) {
+ String txt = ((TextNode) node).getWholeText();
+ if (txt != null) {
+ char[] chars = txt.toCharArray();
+ try {
+ if (chars.length > 0) {
+ handler.characters(chars, 0, chars.length);
+ }
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ } else if (node instanceof DataNode) {
+ //maybe handle script data directly here instead of
+ //passing it through to the HTMLHandler?
+ String txt = ((DataNode) node).getWholeData();
+ if (txt != null) {
+ char[] chars = txt.toCharArray();
+ try {
+ if (chars.length > 0) {
+ handler.characters(chars, 0, chars.length);
+ }
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+ AttributesImpl attributes = new AttributesImpl();
+ Iterator<Attribute> jsoupAttrs = node
+ .attributes()
+ .iterator();
+ while (jsoupAttrs.hasNext()) {
+ Attribute jsoupAttr = jsoupAttrs.next();
+ attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue());
+ }
+ try {
+ handler.startElement("", node.nodeName(), node.nodeName(), attributes);
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+
+ @Override
+ public NodeFilter.FilterResult tail(Node node, int i) {
+ if ("html".equals(node.nodeName())) {
+ ignore = true;
+ }
+ if (ignore) {
+ return FilterResult.CONTINUE;
+ }
+ if (node instanceof TextNode || node instanceof DataNode) {
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+
+ try {
+ handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+ }
+
+ private static class RuntimeSAXException extends RuntimeException {
+ private SAXException wrapped;
+
+ private RuntimeSAXException(SAXException e) {
+ this.wrapped = e;
+ }
+
+ SAXException getWrapped() {
+ return wrapped;
+ }
+ }
private static class XHTMLContentTagHandler extends ToTextContentHandler {
//Used to have a stack to make sure that starting/ending tags were
matched
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6b589591b..529654e04 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -435,7 +435,6 @@
<spring.version>5.3.39</spring.version>
<sqlite.version>3.47.0.0</sqlite.version>
<stax.ex.version>2.1.0</stax.ex.version>
- <tagsoup.version>1.2.1</tagsoup.version>
<testcontainers.version>1.20.3</testcontainers.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent -->
<tukaani.version>1.10</tukaani.version>
@@ -910,11 +909,6 @@
<artifactId>bcprov-jdk18on</artifactId>
<version>${bouncycastle.version}</version>
</dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>${tagsoup.version}</version>
- </dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
index 7fefaa7c5..bdd44b443 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
@@ -41,8 +41,8 @@
<version>${jhighlight.version}</version>
</dependency>
<dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
</dependency>
<dependency>
<groupId>org.ow2.asm</groupId>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index a3d2a4b48..c11f20d36 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -22,22 +22,29 @@ import static
org.codelibs.jhighlight.renderer.XhtmlRendererFactory.JAVA;
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.xml.XMLConstants;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
import org.codelibs.jhighlight.renderer.Renderer;
import org.codelibs.jhighlight.renderer.XhtmlRendererFactory;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
@@ -47,6 +54,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* Generic Source code parser for Java, Groovy, C++.
@@ -61,19 +69,15 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
private static final Pattern AUTHORPATTERN = Pattern.compile("(?im)@author
(.*) *$");
- private static final Map<MediaType, String> TYPES_TO_RENDERER =
- new HashMap<MediaType, String>() {
- private static final long serialVersionUID =
-741976157563751152L;
+ private static final Map<MediaType, String> TYPES_TO_RENDERER = new
HashMap<MediaType, String>() {
+ private static final long serialVersionUID = -741976157563751152L;
- {
- put(MediaType.text("x-c++src"), CPP);
- put(MediaType.text("x-java-source"), JAVA);
- put(MediaType.text("x-groovy"), GROOVY);
- }
- };
-
- //Parse the HTML document
- private static final Schema HTML_SCHEMA = new HTMLSchema();
+ {
+ put(MediaType.text("x-c++src"), CPP);
+ put(MediaType.text("x-java-source"), JAVA);
+ put(MediaType.text("x-groovy"), GROOVY);
+ }
+ };
public SourceCodeParser() {
super();
@@ -89,50 +93,57 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
- ParseContext context) throws IOException, SAXException,
TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
try (AutoDetectReader reader = new
AutoDetectReader(CloseShieldInputStream.wrap(stream),
metadata, getEncodingDetector(context))) {
Charset charset = reader.getCharset();
String mediaType = metadata.get(Metadata.CONTENT_TYPE);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (mediaType != null && name != null) {
- MediaType type = MediaType.parse(mediaType);
+ MediaType type = null;
+ if (mediaType != null) {
+ type = MediaType.parse(mediaType);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- StringBuilder out = new StringBuilder();
- String line;
- int nbLines = 0;
- while ((line = reader.readLine()) != null) {
-
out.append(line).append(System.getProperty("line.separator"));
- String author = parserAuthor(line);
- if (author != null) {
- metadata.add(TikaCoreProperties.CREATOR, author);
- }
- nbLines++;
+ } else {
+ throw new TikaException("media type must be set in metadata before parse");
+ }
+ StringBuilder out = new StringBuilder();
+ String line;
+ int nbLines = 0;
+ while ((line = reader.readLine()) != null) {
+ out
+ .append(line)
+ .append(System.getProperty("line.separator"));
+ String author = parserAuthor(line);
+ if (author != null) {
+ metadata.add(TikaCoreProperties.CREATOR, author);
}
- metadata.set("LoC", String.valueOf(nbLines));
- Renderer renderer = getRenderer(type.toString());
-
- String codeAsHtml = renderer.highlight(name, out.toString(),
charset.name(), false);
-
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
+ nbLines++;
+ }
+ metadata.set("LoC", String.valueOf(nbLines));
+ Renderer renderer = getRenderer(type.toString());
- org.ccil.cowan.tagsoup.Parser parser = new
org.ccil.cowan.tagsoup.Parser();
-
parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- parser.setContentHandler(handler);
- parser.parse(new InputSource(new StringReader(codeAsHtml)));
+ String codeAsHtml = renderer.highlight(name, out.toString(),
charset.name(), false);
+ Document document = Jsoup.parse(codeAsHtml);
+ document.quirksMode(Document.QuirksMode.quirks);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
+ xhtml.startDocument();
+ try {
+ NodeTraversor.filter(new TikaNodeFilter(xhtml), document);
+ } catch (RuntimeSAXException e) {
+ throw e.getWrapped();
+ } finally {
+ xhtml.endDocument();
}
}
-
}
- private Renderer getRenderer(String mimeType) {
+ private Renderer getRenderer(String mimeType) throws TikaException {
MediaType mt = MediaType.parse(mimeType);
String type = TYPES_TO_RENDERER.get(mt);
if (type == null) {
- throw new RuntimeException("unparseable content type " + mimeType);
+ throw new TikaException("unparseable content type " + mimeType);
}
return XhtmlRendererFactory.getRenderer(type);
}
@@ -141,9 +152,106 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
private String parserAuthor(String line) {
Matcher m = AUTHORPATTERN.matcher(line);
if (m.find()) {
- return m.group(1).trim();
+ return m
+ .group(1)
+ .trim();
}
return null;
}
+
+ private static class TikaNodeFilter implements NodeFilter {
+ boolean ignore = true;
+ ContentHandler handler;
+
+ private TikaNodeFilter(ContentHandler handler) {
+ this.handler = handler;
+ }
+
+ @Override
+ public NodeFilter.FilterResult head(Node node, int i) {
+ //skip document fragment
+ if ("html".equals(node.nodeName())) {
+ ignore = false;
+ }
+ if (ignore) {
+ return FilterResult.CONTINUE;
+ }
+ if (node instanceof TextNode) {
+ String txt = ((TextNode) node).getWholeText();
+ if (txt != null) {
+ char[] chars = txt.toCharArray();
+ try {
+ if (chars.length > 0) {
+ handler.characters(chars, 0, chars.length);
+ }
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ } else if (node instanceof DataNode) {
+ //maybe handle script data directly here instead of
+ //passing it through to the HTMLHandler?
+ String txt = ((DataNode) node).getWholeData();
+ if (txt != null) {
+ char[] chars = txt.toCharArray();
+ try {
+ if (chars.length > 0) {
+ handler.characters(chars, 0, chars.length);
+ }
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+ AttributesImpl attributes = new AttributesImpl();
+ Iterator<Attribute> jsoupAttrs = node
+ .attributes()
+ .iterator();
+ while (jsoupAttrs.hasNext()) {
+ Attribute jsoupAttr = jsoupAttrs.next();
+ attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue());
+ }
+ try {
+ handler.startElement("", node.nodeName(), node.nodeName(), attributes);
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+
+ @Override
+ public NodeFilter.FilterResult tail(Node node, int i) {
+ if ("html".equals(node.nodeName())) {
+ ignore = true;
+ }
+ if (ignore) {
+ return FilterResult.CONTINUE;
+ }
+ if (node instanceof TextNode || node instanceof DataNode) {
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+
+ try {
+ handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
+ } catch (SAXException e) {
+ throw new RuntimeSAXException(e);
+ }
+ return NodeFilter.FilterResult.CONTINUE;
+ }
+ }
+
+ private static class RuntimeSAXException extends RuntimeException {
+ private SAXException wrapped;
+
+ private RuntimeSAXException(SAXException e) {
+ this.wrapped = e;
+ }
+
+ SAXException getWrapped() {
+ return wrapped;
+ }
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 45758f89b..e932c066d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -56,12 +56,12 @@ public class SourceCodeParserTest extends TikaTest {
getXML(getResourceAsStream("/test-documents/testJAVA.java"),
sourceCodeParser,
createMetadata("text/x-java-source")).xml;
- assertTrue(htmlContent.indexOf("<html:html lang=\"en\"
xml:lang=\"en\"") == 0);
+ assertTrue(htmlContent.indexOf("<html xmlns=\"http") == 0);
+ assertContains("xml:lang=\"en\"", htmlContent);
assertTrue(htmlContent.indexOf(
- "<html:span class=\"java_keyword\">public</span><html:span
class=\"java_plain\">") >
+ "<span class=\"java_keyword\">public</span><span
class=\"java_plain\">") >
0);
- assertTrue(htmlContent.indexOf("<html:span
class=\"java_keyword\">static</span>") > 0);
- assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+ assertTrue(htmlContent.indexOf("<span
class=\"java_keyword\">static</span>") > 0);
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
index c60d133b7..0255a9161 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
@@ -93,16 +93,13 @@ public class ChmParser implements Parser {
private void parsePage(byte[] byteObject, Parser htmlParser,
ContentHandler xhtml,
- ParseContext context) throws TikaException,
SAXException { // throws IOException
+ ParseContext context) throws TikaException,
IOException, SAXException { // throws IOException
InputStream stream = null;
Metadata metadata = new Metadata();
ContentHandler handler = new EmbeddedContentHandler(new
BodyContentHandler(xhtml));// -1
- try {
- stream =
UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get();
- htmlParser.parse(stream, handler, metadata, context);
- } catch (IOException e) {
- // Pushback overflow from tagsoup
- }
+ stream =
UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get();
+ htmlParser.parse(stream, handler, metadata, context);
+
}
}
diff --git a/tika-server/tika-server-eval/pom.xml
b/tika-server/tika-server-eval/pom.xml
index 4e7275f3e..52f8f594e 100644
--- a/tika-server/tika-server-eval/pom.xml
+++ b/tika-server/tika-server-eval/pom.xml
@@ -69,7 +69,7 @@
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
- <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
+ <exclude>org.jsoup:jsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-databind:jar:</exclude>