Author: kwright
Date: Tue Aug 14 17:39:03 2018
New Revision: 1838040

URL: http://svn.apache.org/viewvc?rev=1838040&view=rev
Log:
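Fix more formatting and logging statement issues in JsoupProcessing:
re-indent the method with two spaces, import Logging from the crawler
package instead of core, log meta tags at debug level on
Logging.connectors instead of at warn level on Logging.root, and remove
the leftover debugging warn statements around keyword extraction.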

Modified:
    
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java

Modified: 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1838040&r1=1838039&r2=1838040&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 (original)
+++ 
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 Tue Aug 14 17:39:03 2018
@@ -27,7 +27,7 @@ import java.util.ArrayList;
 import java.util.Hashtable;
 import java.util.List;
 
-import org.apache.manifoldcf.core.system.Logging;
+import org.apache.manifoldcf.crawler.system.Logging;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -38,144 +38,142 @@ public class JsoupProcessing {
 
 
 
-       public static Hashtable<String,String> 
extractTextAndMetadataHtmlDocument(InputStream streamDoc,String 
whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
-               Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
-               Hashtable<String,String> metadata = new 
Hashtable<String,String>();
-               for(Element meta : doc.select("meta")) {
-                       Logging.root.warn("Name: " + meta.attr("name") + " - 
Content: " + meta.attr("content"));
-                       metadata.put(meta.attr("name"), meta.attr("content"));
-               }
+  public static Hashtable<String,String> 
extractTextAndMetadataHtmlDocument(InputStream streamDoc,String 
whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
+    Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+    Hashtable<String,String> metadata = new Hashtable<String,String>();
+    for(Element meta : doc.select("meta")) {
+      Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " + 
meta.attr("content"));
+      metadata.put(meta.attr("name"), meta.attr("content"));
+    }
 
 
-               if (doc.select("title") != null){
-                       String title = doc.select("title").text();
-                       metadata.put("title", title);
-               }
+    if (doc.select("title") != null){
+      String title = doc.select("title").text();
+      metadata.put("title", title);
+    }
 
-               Element element_keywords = 
doc.select("meta[name='keywords']").first();
-               Logging.root.warn("keywordsjsoupnounet");
-               if (element_keywords != null) {
-                       String keywords = (element_keywords.attr("content"));
-                       Logging.root.warn("keyyyyyywords"+keywords);
-                       metadata.put("keywords",keywords);
-               }
+    Element element_keywords = doc.select("meta[name='keywords']").first();
+    if (element_keywords != null) {
+      String keywords = (element_keywords.attr("content"));
+      metadata.put("keywords",keywords);
+    }
 
-               Element element_description = 
doc.select("meta[name=\"description\"]").first();
-               if (element_description != null) {
-                       String description = 
(element_description.attr("content"));
-                       metadata.put("description",description);
-               }
+    Element element_description = 
doc.select("meta[name=\"description\"]").first();
+    if (element_description != null) {
+      String description = (element_description.attr("content"));
+      metadata.put("description",description);
+    }
 
-               Element element_author = 
doc.select("meta[name=\"author\"]").first();
-               if (element_author != null) {
-                       String author = (element_author.attr("content"));
-                       metadata.put("author",author);
-               }
+    Element element_author = doc.select("meta[name=\"author\"]").first();
+    if (element_author != null) {
+      String author = (element_author.attr("content"));
+      metadata.put("author",author);
+    }
 
 
-               Element element_dcterms_subject = 
doc.select("meta[name=\"dcterms.subject\"]").first();
-               if (element_dcterms_subject != null) {
-                       String dc_terms_subject = 
(element_dcterms_subject.attr("content"));
-                       metadata.put("dc_terms_subject",dc_terms_subject);
-               }
+    Element element_dcterms_subject = 
doc.select("meta[name=\"dcterms.subject\"]").first();
+    if (element_dcterms_subject != null) {
+      String dc_terms_subject = (element_dcterms_subject.attr("content"));
+      metadata.put("dc_terms_subject",dc_terms_subject);
+    }
 
 
-               Element element_dcterms_title = 
doc.select("meta[name=\"dcterms.title\"]").first();
-               if (element_dcterms_title != null) {
-                       String dc_terms_title = 
(element_dcterms_title.attr("content"));
-                       metadata.put("dc_terms_title",dc_terms_title);
+    Element element_dcterms_title = 
doc.select("meta[name=\"dcterms.title\"]").first();
+    if (element_dcterms_title != null) {
+      String dc_terms_title = (element_dcterms_title.attr("content"));
+      metadata.put("dc_terms_title",dc_terms_title);
 
-               }
+    }
 
-               Element element_dcterms_creator = 
doc.select("meta[name=\"dcterms.creator\"]").first();
-               if (element_dcterms_creator != null) {
-                       String dc_terms_creator = 
(element_dcterms_creator.attr("content"));
-                       metadata.put("dc_terms_creator",dc_terms_creator);
+    Element element_dcterms_creator = 
doc.select("meta[name=\"dcterms.creator\"]").first();
+    if (element_dcterms_creator != null) {
+      String dc_terms_creator = (element_dcterms_creator.attr("content"));
+      metadata.put("dc_terms_creator",dc_terms_creator);
 
-               }
+    }
 
-               Element element_dcterms_description = 
doc.select("meta[name=\"dcterms.description\"]").first();
-               if (element_dcterms_description != null) {
-                       String dc_terms_description = 
(element_dcterms_description.attr("content"));
-                       
metadata.put("dc_terms_description",dc_terms_description);
+    Element element_dcterms_description = 
doc.select("meta[name=\"dcterms.description\"]").first();
+    if (element_dcterms_description != null) {
+      String dc_terms_description = 
(element_dcterms_description.attr("content"));
+      metadata.put("dc_terms_description",dc_terms_description);
 
-               }
+    }
 
-               Element element_dcterms_publisher = 
doc.select("meta[name=\"dcterms.publisher\"]").first();
-               if (element_dcterms_publisher != null) {
-                       String dc_terms_publisher = 
(element_dcterms_publisher.attr("content"));
-                       metadata.put("dc_terms_publisher",dc_terms_publisher);
+    Element element_dcterms_publisher = 
doc.select("meta[name=\"dcterms.publisher\"]").first();
+    if (element_dcterms_publisher != null) {
+      String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
+      metadata.put("dc_terms_publisher",dc_terms_publisher);
 
-               }
+    }
 
-               Element element_dcterms_contributor = 
doc.select("meta[name=\"dcterms.contributor\"]").first();
-               if (element_dcterms_contributor != null) {
-                       String dc_terms_contributor = 
(element_dcterms_contributor.attr("content"));
-                       
metadata.put("dc_terms_contributor",dc_terms_contributor);
+    Element element_dcterms_contributor = 
doc.select("meta[name=\"dcterms.contributor\"]").first();
+    if (element_dcterms_contributor != null) {
+      String dc_terms_contributor = 
(element_dcterms_contributor.attr("content"));
+      metadata.put("dc_terms_contributor",dc_terms_contributor);
 
-               }
+    }
 
-               Element element_dcterms_date = 
doc.select("meta[name=\"dcterms.date\"]").first();
-               if (element_dcterms_date != null) {
-                       String dc_terms_date = 
(element_dcterms_date.attr("content"));
-                       metadata.put("dc_terms_date",dc_terms_date);
+    Element element_dcterms_date = 
doc.select("meta[name=\"dcterms.date\"]").first();
+    if (element_dcterms_date != null) {
+      String dc_terms_date = (element_dcterms_date.attr("content"));
+      metadata.put("dc_terms_date",dc_terms_date);
 
-               }
+    }
 
-               Element element_dcterms_type = 
doc.select("meta[name=\"dcterms.type\"]").first();
-               if (element_dcterms_type != null) {
-                       String dc_terms_type = 
(element_dcterms_type.attr("content"));
-                       metadata.put("dc_terms_type",dc_terms_type);
+    Element element_dcterms_type = 
doc.select("meta[name=\"dcterms.type\"]").first();
+    if (element_dcterms_type != null) {
+      String dc_terms_type = (element_dcterms_type.attr("content"));
+      metadata.put("dc_terms_type",dc_terms_type);
 
-               }
+    }
 
-               Element element_dcterms_format = 
doc.select("meta[name=\"dcterms.format\"]").first();
-               if (element_dcterms_format != null) {
-                       String dc_terms_format = 
(element_dcterms_format.attr("content"));
-                       metadata.put("dc_terms_format",dc_terms_format);
+    Element element_dcterms_format = 
doc.select("meta[name=\"dcterms.format\"]").first();
+    if (element_dcterms_format != null) {
+      String dc_terms_format = (element_dcterms_format.attr("content"));
+      metadata.put("dc_terms_format",dc_terms_format);
 
-               }
+    }
 
-               Element element_dcterms_language = 
doc.select("meta[name=\"dcterms.language\"]").first();
-               if (element_dcterms_language != null) {
-                       String dc_terms_language = 
(element_dcterms_language.attr("content"));
-                       metadata.put("dc_terms_language",dc_terms_language);
+    Element element_dcterms_language = 
doc.select("meta[name=\"dcterms.language\"]").first();
+    if (element_dcterms_language != null) {
+      String dc_terms_language = (element_dcterms_language.attr("content"));
+      metadata.put("dc_terms_language",dc_terms_language);
 
-               }
+    }
 
-               Element element_dcterms_identifier = 
doc.select("meta[name=\"dcterms.identifier\"]").first();
-               if (element_dcterms_identifier != null) {
-                       String dc_terms_identifier = 
(element_dcterms_identifier.attr("content"));
-                       metadata.put("dc_terms_identifier",dc_terms_identifier);
-               }
+    Element element_dcterms_identifier = 
doc.select("meta[name=\"dcterms.identifier\"]").first();
+    if (element_dcterms_identifier != null) {
+      String dc_terms_identifier = 
(element_dcterms_identifier.attr("content"));
+      metadata.put("dc_terms_identifier",dc_terms_identifier);
+    }
 
 
-               Element docToKeep = doc.body();
-               String finalDoc ;
+    Element docToKeep = doc.body();
+    String finalDoc ;
 
-               // Englobing Tag
-               if (whitelist!="body"){
-                       docToKeep = doc.select(whitelist).first();
-               }
+    // Englobing Tag
+    if (whitelist!="body"){
+      docToKeep = doc.select(whitelist).first();
+    }
 
 
 
-               // Blacklist
-               if (blacklist != null){
-                       for (int i=0; i< blacklist.size();i++){
-                               docToKeep.select(blacklist.get(i)).remove();
-                       }
-               }
+    // Blacklist
+    if (blacklist != null){
+      for (int i=0; i< blacklist.size();i++){
+        docToKeep.select(blacklist.get(i)).remove();
+      }
+    }
 
-               if (stripHtml)
-                       finalDoc = docToKeep.text();
-               else
-                       finalDoc = docToKeep.html();
-               
-               
-               metadata.put("extractedDoc",finalDoc);
+    if (stripHtml)
+      finalDoc = docToKeep.text();
+    else
+      finalDoc = docToKeep.html();
+    
+    
+    metadata.put("extractedDoc",finalDoc);
 
-               return metadata;
-       }
+    return metadata;
+  }
 
 }
\ No newline at end of file


Reply via email to