Author: jmssiera
Date: Fri Nov 19 09:47:07 2021
New Revision: 1895172

URL: http://svn.apache.org/viewvc?rev=1895172&view=rev
Log:
Fix CONNECTORS-1679

Modified:
    
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java

Modified: 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1895172&r1=1895171&r2=1895172&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 (original)
+++ 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 Fri Nov 19 09:47:07 2021
@@ -34,6 +34,7 @@ import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.jsoup.nodes.Document.OutputSettings;
 import org.jsoup.nodes.Entities.EscapeMode;
+import org.jsoup.parser.Parser;
 import org.jsoup.safety.Whitelist;
 
 public class JsoupProcessing {
@@ -179,6 +180,8 @@ public class JsoupProcessing {
     else {
       finalDoc = Jsoup.clean(docToKeep.html(),Whitelist.relaxed());
     }
+    // Jsoup escapes entities (e.g. '&' becomes '&amp;'); we don't want that, as 
the extracted text must remain as it is, so unescape them
+    finalDoc = Parser.unescapeEntities(finalDoc, true);
     metadata.put("extractedDoc",finalDoc);
 
     return metadata;


Reply via email to