Author: markus
Date: Mon Jun 25 14:49:03 2012
New Revision: 1353585

URL: http://svn.apache.org/viewvc?rev=1353585&view=rev
Log:
NUTCH-1407 BasicIndexingFilter to optionally add domain field
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1353585&r1=1353584&r2=1353585&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jun 25 14:49:03 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1407 BasicIndexingFilter to optionally add domain field (markus) + * NUTCH-1408 RobotRulesParser main doesn't take URL's (markus) * NUTCH-1400 Remove developer -core option for bin/nutch (jnioche) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1353585&r1=1353584&r2=1353585&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Mon Jun 25 14:49:03 2012 @@ -900,6 +900,12 @@ </property> <property> + <name>indexer.add.domain</name> + <value>false</value> + <description>Whether to add the domain field to a NutchDocument.</description> +</property> + +<property> <name>indexer.skip.notmodified</name> <value>false</value> <description>Whether the indexer will skip records with a db_notmodified status. 
Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1353585&r1=1353584&r2=1353585&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Jun 25 14:49:03 2012 @@ -26,6 +26,7 @@ import org.apache.nutch.parse.Parse; import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.URLUtil; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@ -43,6 +44,7 @@ public class BasicIndexingFilter impleme private int MAX_TITLE_LENGTH; private int MAX_CONTENT_LENGTH; + private boolean addDomain = false; private Configuration conf; public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) @@ -60,6 +62,11 @@ public class BasicIndexingFilter impleme } else { u = new URL(urlString); } + + if (addDomain) { + doc.add("domain", URLUtil.getDomainName(u)); + } + host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); @@ -104,6 +111,7 @@ public class BasicIndexingFilter impleme public void setConf(Configuration conf) { this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + this.addDomain = conf.getBoolean("indexer.add.domain", false); this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1); }