Author: kwright
Date: Fri May 10 19:45:39 2019
New Revision: 1859100
URL: http://svn.apache.org/viewvc?rev=1859100&view=rev
Log:
Fix for CONNECTORS-1605
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1859100&r1=1859099&r2=1859100&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri May 10 19:45:39 2019
@@ -3,6 +3,8 @@ $Id$
======================= 2.14-dev =====================
+CONNECTORS-1605: HTML-extractor: Use body tag if specified tag is not available.
+(Olivier Tavard)
======================= Release 2.13 =====================
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1859100&r1=1859099&r2=1859100&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri May 10 19:45:39 2019
@@ -154,6 +154,10 @@ public class JsoupProcessing {
// Englobing Tag
if (whitelist!="body"){
docToKeep = doc.select(whitelist).first();
+ // fallback to body tag if the tag chosen is not present in the page
+ if (doc.select(whitelist).size() == 0) {
+ docToKeep = doc.select("body").first();
+ }
}