Author: kwright
Date: Fri May 10 19:45:39 2019
New Revision: 1859100

URL: http://svn.apache.org/viewvc?rev=1859100&view=rev
Log:
Fix for CONNECTORS-1605

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1859100&r1=1859099&r2=1859100&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri May 10 19:45:39 2019
@@ -3,6 +3,8 @@ $Id$
 
 ======================= 2.14-dev =====================
 
+CONNECTORS-1605: HTML-extractor: Use body tag if specified tag is not available.
+(Olivier Tavard)
 
 ======================= Release 2.13 =====================
 

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1859100&r1=1859099&r2=1859100&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri May 10 19:45:39 2019
@@ -154,6 +154,10 @@ public class JsoupProcessing {
     // Englobing Tag
     if (whitelist!="body"){
       docToKeep = doc.select(whitelist).first();
+      // fallback to body tag if the tag chosen is not present in the page
+      if (doc.select(whitelist).size() == 0) {
+        docToKeep = doc.select("body").first();
+      }
     }
 
 


Reply via email to