Author: kwright
Date: Fri Dec 11 17:58:53 2020
New Revision: 1884331
URL: http://svn.apache.org/viewvc?rev=1884331&view=rev
Log:
Improvements for CONNECTORS-1660
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1884331&r1=1884330&r2=1884331&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Dec 11 17:58:53 2020
@@ -3,6 +3,9 @@ $Id$
======================= 2.18-dev =====================
+CONNECTORS-1660: Better handling of non-correctly-formatted HTML.
+(Olivier Tavard)
+
NOTICKET: Add missing Jetty JSP jar so crawler UI works in the examples.
(Karl Wright)
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1884331&r1=1884330&r2=1884331&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Dec 11 17:58:53 2020
@@ -32,6 +32,8 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
+import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.safety.Whitelist;
public class JsoupProcessing {
@@ -169,12 +171,12 @@ public class JsoupProcessing {
}
}
- if (stripHtml)
- finalDoc = docToKeep.text();
- else
- finalDoc = docToKeep.html();
-
-
+ if (stripHtml) {
+ finalDoc = Jsoup.clean(docToKeep.html(),"",Whitelist.none(),new OutputSettings().prettyPrint(false));
+ }
+ else {
+ finalDoc = Jsoup.clean(docToKeep.html(),Whitelist.relaxed());
+ }
metadata.put("extractedDoc",finalDoc);
return metadata;