Author: kwright
Date: Fri Dec 11 17:58:53 2020
New Revision: 1884331

URL: http://svn.apache.org/viewvc?rev=1884331&view=rev
Log:
Improvements for CONNECTORS-1660

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1884331&r1=1884330&r2=1884331&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Dec 11 17:58:53 2020
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.18-dev =====================
 
+CONNECTORS-1660: Better handling of non-correctly-formatted HTML.
+(Olivier Tavard)
+
 NOTICKET: Add missing Jetty JSP jar so crawler UI works in the examples.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1884331&r1=1884330&r2=1884331&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Dec 11 17:58:53 2020
@@ -32,6 +32,8 @@ import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
+import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.safety.Whitelist;
 
 public class JsoupProcessing {
 
@@ -169,12 +171,12 @@ public class JsoupProcessing {
       }
     }
 
-    if (stripHtml)
-      finalDoc = docToKeep.text();
-    else
-      finalDoc = docToKeep.html();
-    
-    
+    if (stripHtml) {
+      finalDoc = Jsoup.clean(docToKeep.html(),"",Whitelist.none(),new OutputSettings().prettyPrint(false));
+    }
+    else {
+      finalDoc = Jsoup.clean(docToKeep.html(),Whitelist.relaxed());
+    }
     metadata.put("extractedDoc",finalDoc);
 
     return metadata;


Reply via email to