Author: jmssiera
Date: Fri Nov 19 09:47:07 2021
New Revision: 1895172
URL: http://svn.apache.org/viewvc?rev=1895172&view=rev
Log:
Fix CONNECTORS-1679
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1895172&r1=1895171&r2=1895172&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
(original)
+++
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Fri Nov 19 09:47:07 2021
@@ -34,6 +34,7 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Entities.EscapeMode;
+import org.jsoup.parser.Parser;
import org.jsoup.safety.Whitelist;
public class JsoupProcessing {
@@ -179,6 +180,8 @@ public class JsoupProcessing {
else {
finalDoc = Jsoup.clean(docToKeep.html(),Whitelist.relaxed());
}
+ // Jsoup escapes entities (eg '&' becomes '&amp;'), we don't want that as the extracted text must remain as it is, so unescape them
+ finalDoc = Parser.unescapeEntities(finalDoc, true);
metadata.put("extractedDoc",finalDoc);
return metadata;