Author: kwright
Date: Tue Aug 14 17:39:03 2018
New Revision: 1838040
URL: http://svn.apache.org/viewvc?rev=1838040&view=rev
Log:
Fix more formatting and logging statement issues
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1838040&r1=1838039&r2=1838040&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Tue Aug 14 17:39:03 2018
@@ -27,7 +27,7 @@ import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
-import org.apache.manifoldcf.core.system.Logging;
+import org.apache.manifoldcf.crawler.system.Logging;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -38,144 +38,142 @@ public class JsoupProcessing {
- public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
- Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
- Hashtable<String,String> metadata = new Hashtable<String,String>();
- for(Element meta : doc.select("meta")) {
- Logging.root.warn("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));
- metadata.put(meta.attr("name"), meta.attr("content"));
- }
+ public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
+ Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+ Hashtable<String,String> metadata = new Hashtable<String,String>();
+ for(Element meta : doc.select("meta")) {
+ Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));
+ metadata.put(meta.attr("name"), meta.attr("content"));
+ }
- if (doc.select("title") != null){
- String title = doc.select("title").text();
- metadata.put("title", title);
- }
+ if (doc.select("title") != null){
+ String title = doc.select("title").text();
+ metadata.put("title", title);
+ }
- Element element_keywords =
doc.select("meta[name='keywords']").first();
- Logging.root.warn("keywordsjsoupnounet");
- if (element_keywords != null) {
- String keywords = (element_keywords.attr("content"));
- Logging.root.warn("keyyyyyywords"+keywords);
- metadata.put("keywords",keywords);
- }
+ Element element_keywords = doc.select("meta[name='keywords']").first();
+ if (element_keywords != null) {
+ String keywords = (element_keywords.attr("content"));
+ metadata.put("keywords",keywords);
+ }
- Element element_description =
doc.select("meta[name=\"description\"]").first();
- if (element_description != null) {
- String description =
(element_description.attr("content"));
- metadata.put("description",description);
- }
+ Element element_description =
doc.select("meta[name=\"description\"]").first();
+ if (element_description != null) {
+ String description = (element_description.attr("content"));
+ metadata.put("description",description);
+ }
- Element element_author =
doc.select("meta[name=\"author\"]").first();
- if (element_author != null) {
- String author = (element_author.attr("content"));
- metadata.put("author",author);
- }
+ Element element_author = doc.select("meta[name=\"author\"]").first();
+ if (element_author != null) {
+ String author = (element_author.attr("content"));
+ metadata.put("author",author);
+ }
- Element element_dcterms_subject =
doc.select("meta[name=\"dcterms.subject\"]").first();
- if (element_dcterms_subject != null) {
- String dc_terms_subject =
(element_dcterms_subject.attr("content"));
- metadata.put("dc_terms_subject",dc_terms_subject);
- }
+ Element element_dcterms_subject =
doc.select("meta[name=\"dcterms.subject\"]").first();
+ if (element_dcterms_subject != null) {
+ String dc_terms_subject = (element_dcterms_subject.attr("content"));
+ metadata.put("dc_terms_subject",dc_terms_subject);
+ }
- Element element_dcterms_title =
doc.select("meta[name=\"dcterms.title\"]").first();
- if (element_dcterms_title != null) {
- String dc_terms_title =
(element_dcterms_title.attr("content"));
- metadata.put("dc_terms_title",dc_terms_title);
+ Element element_dcterms_title =
doc.select("meta[name=\"dcterms.title\"]").first();
+ if (element_dcterms_title != null) {
+ String dc_terms_title = (element_dcterms_title.attr("content"));
+ metadata.put("dc_terms_title",dc_terms_title);
- }
+ }
- Element element_dcterms_creator =
doc.select("meta[name=\"dcterms.creator\"]").first();
- if (element_dcterms_creator != null) {
- String dc_terms_creator =
(element_dcterms_creator.attr("content"));
- metadata.put("dc_terms_creator",dc_terms_creator);
+ Element element_dcterms_creator =
doc.select("meta[name=\"dcterms.creator\"]").first();
+ if (element_dcterms_creator != null) {
+ String dc_terms_creator = (element_dcterms_creator.attr("content"));
+ metadata.put("dc_terms_creator",dc_terms_creator);
- }
+ }
- Element element_dcterms_description =
doc.select("meta[name=\"dcterms.description\"]").first();
- if (element_dcterms_description != null) {
- String dc_terms_description =
(element_dcterms_description.attr("content"));
- metadata.put("dc_terms_description",dc_terms_description);
+ Element element_dcterms_description =
doc.select("meta[name=\"dcterms.description\"]").first();
+ if (element_dcterms_description != null) {
+ String dc_terms_description =
(element_dcterms_description.attr("content"));
+ metadata.put("dc_terms_description",dc_terms_description);
- }
+ }
- Element element_dcterms_publisher =
doc.select("meta[name=\"dcterms.publisher\"]").first();
- if (element_dcterms_publisher != null) {
- String dc_terms_publisher =
(element_dcterms_publisher.attr("content"));
- metadata.put("dc_terms_publisher",dc_terms_publisher);
+ Element element_dcterms_publisher =
doc.select("meta[name=\"dcterms.publisher\"]").first();
+ if (element_dcterms_publisher != null) {
+ String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
+ metadata.put("dc_terms_publisher",dc_terms_publisher);
- }
+ }
- Element element_dcterms_contributor =
doc.select("meta[name=\"dcterms.contributor\"]").first();
- if (element_dcterms_contributor != null) {
- String dc_terms_contributor =
(element_dcterms_contributor.attr("content"));
- metadata.put("dc_terms_contributor",dc_terms_contributor);
+ Element element_dcterms_contributor =
doc.select("meta[name=\"dcterms.contributor\"]").first();
+ if (element_dcterms_contributor != null) {
+ String dc_terms_contributor =
(element_dcterms_contributor.attr("content"));
+ metadata.put("dc_terms_contributor",dc_terms_contributor);
- }
+ }
- Element element_dcterms_date =
doc.select("meta[name=\"dcterms.date\"]").first();
- if (element_dcterms_date != null) {
- String dc_terms_date =
(element_dcterms_date.attr("content"));
- metadata.put("dc_terms_date",dc_terms_date);
+ Element element_dcterms_date =
doc.select("meta[name=\"dcterms.date\"]").first();
+ if (element_dcterms_date != null) {
+ String dc_terms_date = (element_dcterms_date.attr("content"));
+ metadata.put("dc_terms_date",dc_terms_date);
- }
+ }
- Element element_dcterms_type =
doc.select("meta[name=\"dcterms.type\"]").first();
- if (element_dcterms_type != null) {
- String dc_terms_type =
(element_dcterms_type.attr("content"));
- metadata.put("dc_terms_type",dc_terms_type);
+ Element element_dcterms_type =
doc.select("meta[name=\"dcterms.type\"]").first();
+ if (element_dcterms_type != null) {
+ String dc_terms_type = (element_dcterms_type.attr("content"));
+ metadata.put("dc_terms_type",dc_terms_type);
- }
+ }
- Element element_dcterms_format =
doc.select("meta[name=\"dcterms.format\"]").first();
- if (element_dcterms_format != null) {
- String dc_terms_format =
(element_dcterms_format.attr("content"));
- metadata.put("dc_terms_format",dc_terms_format);
+ Element element_dcterms_format =
doc.select("meta[name=\"dcterms.format\"]").first();
+ if (element_dcterms_format != null) {
+ String dc_terms_format = (element_dcterms_format.attr("content"));
+ metadata.put("dc_terms_format",dc_terms_format);
- }
+ }
- Element element_dcterms_language =
doc.select("meta[name=\"dcterms.language\"]").first();
- if (element_dcterms_language != null) {
- String dc_terms_language =
(element_dcterms_language.attr("content"));
- metadata.put("dc_terms_language",dc_terms_language);
+ Element element_dcterms_language =
doc.select("meta[name=\"dcterms.language\"]").first();
+ if (element_dcterms_language != null) {
+ String dc_terms_language = (element_dcterms_language.attr("content"));
+ metadata.put("dc_terms_language",dc_terms_language);
- }
+ }
- Element element_dcterms_identifier =
doc.select("meta[name=\"dcterms.identifier\"]").first();
- if (element_dcterms_identifier != null) {
- String dc_terms_identifier =
(element_dcterms_identifier.attr("content"));
- metadata.put("dc_terms_identifier",dc_terms_identifier);
- }
+ Element element_dcterms_identifier =
doc.select("meta[name=\"dcterms.identifier\"]").first();
+ if (element_dcterms_identifier != null) {
+ String dc_terms_identifier =
(element_dcterms_identifier.attr("content"));
+ metadata.put("dc_terms_identifier",dc_terms_identifier);
+ }
- Element docToKeep = doc.body();
- String finalDoc ;
+ Element docToKeep = doc.body();
+ String finalDoc ;
- // Englobing Tag
- if (whitelist!="body"){
- docToKeep = doc.select(whitelist).first();
- }
+ // Englobing Tag
+ if (whitelist!="body"){
+ docToKeep = doc.select(whitelist).first();
+ }
- // Blacklist
- if (blacklist != null){
- for (int i=0; i< blacklist.size();i++){
- docToKeep.select(blacklist.get(i)).remove();
- }
- }
+ // Blacklist
+ if (blacklist != null){
+ for (int i=0; i< blacklist.size();i++){
+ docToKeep.select(blacklist.get(i)).remove();
+ }
+ }
- if (stripHtml)
- finalDoc = docToKeep.text();
- else
- finalDoc = docToKeep.html();
-
-
- metadata.put("extractedDoc",finalDoc);
+ if (stripHtml)
+ finalDoc = docToKeep.text();
+ else
+ finalDoc = docToKeep.html();
+
+
+ metadata.put("extractedDoc",finalDoc);
- return metadata;
- }
+ return metadata;
+ }
}
\ No newline at end of file