Modified: nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java (original) +++ nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * (number of "hops" from seed URLs). */ package org.apache.nutch.scoring.depth; +
Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original) +++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -32,8 +32,7 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.scoring.ScoringFilter; import org.apache.nutch.scoring.ScoringFilterException; -public class LinkAnalysisScoringFilter - implements ScoringFilter { +public class LinkAnalysisScoringFilter implements ScoringFilter { private Configuration conf; private float normalizedScore = 1.00f; @@ -52,46 +51,44 @@ public class LinkAnalysisScoringFilter } public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) - throws ScoringFilterException { + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { return adjust; } public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { + throws ScoringFilterException { return datum.getScore() * initSort; } public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { return (normalizedScore * dbDatum.getScore()); } public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { + throws ScoringFilterException { datum.setScore(0.0f); } public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { + throws ScoringFilterException { } public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - parse.getData().getContentMeta().set(Nutch.SCORE_KEY, - content.getMetadata().get(Nutch.SCORE_KEY)); + throws ScoringFilterException { + parse.getData().getContentMeta() + .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); } public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) - throws ScoringFilterException { + throws ScoringFilterException { content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); } public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) - throws ScoringFilterException { + List<CrawlDatum> inlinked) throws ScoringFilterException { // nothing to do } Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java (original) +++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * {@link org.apache.nutch.scoring.webgraph.WebGraph}. */ package org.apache.nutch.scoring.link; + Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original) +++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -41,17 +41,17 @@ import org.apache.nutch.scoring.ScoringF /** * This plugin implements a variant of an Online Page Importance Computation - * (OPIC) score, described in this paper: - * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/> - * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), - * Adaptive On-Line Page Importance Computation - * </a>. + * (OPIC) score, described in this paper: <a + * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/> + * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive + * On-Line Page Importance Computation </a>. * * @author Andrzej Bialecki */ public class OPICScoringFilter implements ScoringFilter { - private final static Logger LOG = LoggerFactory.getLogger(OPICScoringFilter.class); + private final static Logger LOG = LoggerFactory + .getLogger(OPICScoringFilter.class); private Configuration conf; private float scoreInjected; @@ -72,28 +72,35 @@ public class OPICScoringFilter implement countFiltered = conf.getBoolean("db.score.count.filtered", false); } - public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException { + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { } - /** Set to 0.0f (unknown value) - inlink contributions will bring it to - * a correct level. Newly discovered pages have at least one inlink. */ - public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { + /** + * Set to 0.0f (unknown value) - inlink contributions will bring it to a + * correct level. Newly discovered pages have at least one inlink. + */ + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. */ - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { return datum.getScore() * initSort; } /** Increase the score by a sum of inlinked scores. */ - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException { + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { float adjust = 0.0f; for (int i = 0; i < inlinked.size(); i++) { CrawlDatum linked = inlinked.get(i); adjust += linked.getScore(); } - if (old == null) old = datum; + if (old == null) + old = datum; datum.setScore(old.getScore() + adjust); } @@ -104,11 +111,17 @@ public class OPICScoringFilter implement /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */ public void passScoreAfterParsing(Text url, Content content, Parse parse) { - parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); + parse.getData().getContentMeta() + .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); } - /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */ - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { + /** + * Get a float value from Fetcher.SCORE_KEY, divide it by the number of + * outlinks and apply. + */ + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { float score = scoreInjected; String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY); if (scoreString != null) { @@ -135,7 +148,7 @@ public class OPICScoringFilter implement try { String toHost = new URL(target.getKey().toString()).getHost(); String fromHost = new URL(fromUrl.toString()).getHost(); - if(toHost.equalsIgnoreCase(fromHost)){ + if (toHost.equalsIgnoreCase(fromHost)) { target.getValue().setScore(internalScore); } else { target.getValue().setScore(externalScore); @@ -151,8 +164,10 @@ public class OPICScoringFilter implement return adjust; } - /** Dampen the boost value by scorePower.*/ - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { - return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore; + /** Dampen the boost value by scorePower. */ + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore; } } Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java (original) +++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * (OPIC) algorithm. */ package org.apache.nutch.scoring.opic; + Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original) +++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Jan 29 05:38:59 2015 @@ -52,22 +52,24 @@ public class CollectionManager extends C transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>(); transient URL configfile; - + public CollectionManager(Configuration conf) { super(conf); init(); } - - /** + + /** * Used for testing */ - protected CollectionManager(){ + protected CollectionManager() { super(NutchConfiguration.create()); } - protected void init(){ + protected void init() { try { - if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); } + if (LOG.isInfoEnabled()) { + LOG.info("initializing CollectionManager"); + } // initialize known subcollections configfile = getConf().getResource( getConf().get("subcollections.config", DEFAULT_FILE_NAME)); @@ -92,7 +94,7 @@ public class CollectionManager extends C if (LOG.isInfoEnabled()) { LOG.info("file has " + nodeList.getLength() + " elements"); } - + for (int i = 0; i < nodeList.getLength(); i++) { Element scElem = (Element) nodeList.item(i); Subcollection subCol = new Subcollection(getConf()); @@ -103,18 +105,18 @@ public class CollectionManager extends C LOG.info("Cannot find collections"); } } - + public static CollectionManager getCollectionManager(Configuration conf) { String key = "collectionmanager"; ObjectCache objectCache = ObjectCache.get(conf); - CollectionManager impl = (CollectionManager)objectCache.getObject(key); + CollectionManager impl = (CollectionManager) objectCache.getObject(key); if (impl == null) { try { if (LOG.isInfoEnabled()) { LOG.info("Instantiating CollectionManager"); } - impl=new CollectionManager(conf); - objectCache.setObject(key,impl); + impl = new CollectionManager(conf); + objectCache.setObject(key, impl); } catch (Exception e) { throw new RuntimeException("Couldn't create CollectionManager", e); } @@ -165,7 +167,7 @@ public class CollectionManager extends C /** * Return names of collections url is part of - * + * * @param url * The url to test against Collections * @return Subcollections @@ -203,8 +205,8 @@ public class CollectionManager extends C */ public void save() throws IOException { try { - final FileOutputStream fos = new FileOutputStream(new File(configfile - .getFile())); + final FileOutputStream fos = new FileOutputStream(new File( + configfile.getFile())); final Document doc = new DocumentImpl(); final Element collections = doc .createElement(Subcollection.TAG_COLLECTIONS); Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original) +++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Jan 29 05:38:59 2015 @@ -32,20 +32,20 @@ import org.w3c.dom.NodeList; * SubCollection represents a subset of index, you can define url patterns that * will indicate that particular page (url) is part of SubCollection. */ -public class Subcollection extends Configured implements URLFilter{ - - public static final String TAG_COLLECTIONS="subcollections"; - public static final String TAG_COLLECTION="subcollection"; - public static final String TAG_WHITELIST="whitelist"; - public static final String TAG_BLACKLIST="blacklist"; - public static final String TAG_NAME="name"; - public static final String TAG_KEY="key"; - public static final String TAG_ID="id"; +public class Subcollection extends Configured implements URLFilter { + + public static final String TAG_COLLECTIONS = "subcollections"; + public static final String TAG_COLLECTION = "subcollection"; + public static final String TAG_WHITELIST = "whitelist"; + public static final String TAG_BLACKLIST = "blacklist"; + public static final String TAG_NAME = "name"; + public static final String TAG_KEY = "key"; + public static final String TAG_ID = "id"; List<String> blackList = new ArrayList<String>(); List<String> whiteList = new ArrayList<String>(); - /** + /** * SubCollection identifier */ String id; @@ -55,12 +55,12 @@ public class Subcollection extends Confi */ String key; - /** + /** * SubCollection name */ String name; - /** + /** * SubCollection whitelist as String */ String wlString; @@ -70,31 +70,37 @@ public class Subcollection extends Confi */ String blString; - /** public Constructor + /** + * public Constructor * - * @param id id of SubCollection - * @param name name of SubCollection + * @param id + * id of SubCollection + * @param name + * name of SubCollection */ public Subcollection(String id, String name, Configuration conf) { this(id, name, null, conf); } - /** public Constructor - * - * @param id id of SubCollection - * @param name name of SubCollection + /** + * public Constructor + * + * @param id + * id of SubCollection + * @param name + * name of SubCollection */ public Subcollection(String id, String name, String key, Configuration conf) { this(conf); - this.id=id; + this.id = id; this.key = key; this.name = name; } - public Subcollection(Configuration conf){ + public Subcollection(Configuration conf) { super(conf); } - + /** * @return Returns the name */ @@ -232,7 +238,8 @@ public class Subcollection extends Confi /** * Set contents of blacklist from String * - * @param list the blacklist contents + * @param list + * the blacklist contents */ public void setBlackList(String list) { this.blString = list; @@ -242,7 +249,8 @@ public class Subcollection extends Confi /** * Set contents of whitelist from String * - * @param list the whitelist contents + * @param list + * the whitelist contents */ public void setWhiteList(String list) { this.wlString = list; Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original) +++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -35,21 +35,22 @@ import org.apache.nutch.collection.Subco import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; - -public class SubcollectionIndexingFilter extends Configured implements IndexingFilter { +public class SubcollectionIndexingFilter extends Configured implements + IndexingFilter { private Configuration conf; - public SubcollectionIndexingFilter(){ + public SubcollectionIndexingFilter() { super(NutchConfiguration.create()); } - + public SubcollectionIndexingFilter(Configuration conf) { super(conf); } - + /** - * @param Configuration conf + * @param Configuration + * conf */ public void setConf(Configuration conf) { this.conf = conf; @@ -63,7 +64,6 @@ public class SubcollectionIndexingFilter return this.conf; } - /** * Doc field name */ @@ -72,7 +72,8 @@ public class SubcollectionIndexingFilter /** * Logger */ - public static final Logger LOG = LoggerFactory.getLogger(SubcollectionIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(SubcollectionIndexingFilter.class); /** * "Mark" document to be a part of subcollection @@ -81,7 +82,8 @@ public class SubcollectionIndexingFilter * @param url */ private void addSubCollectionField(NutchDocument doc, String url) { - for (Subcollection coll : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) { + for (Subcollection coll : CollectionManager.getCollectionManager(getConf()) + .getSubCollections(url)) { if (coll.getKey() == null) { doc.add(fieldName, coll.getName()); } else { @@ -90,7 +92,8 @@ public class SubcollectionIndexingFilter } } - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { String sUrl = url.toString(); addSubCollectionField(doc, sUrl); return doc; Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java (original) +++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java Thu Jan 29 05:38:59 2015 @@ -22,3 +22,4 @@ * {@link org.apache.nutch.collection}. */ package org.apache.nutch.indexer.subcollection; + Modified: nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original) +++ nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Thu Jan 29 05:38:59 2015 @@ -25,31 +25,34 @@ import org.junit.Assert; import org.junit.Test; public class TestSubcollection { - - /**Test filtering logic + + /** + * Test filtering logic * * @throws Exception */ @Test public void testFilter() throws Exception { - Subcollection sc=new Subcollection(NutchConfiguration.create()); + Subcollection sc = new Subcollection(NutchConfiguration.create()); sc.setWhiteList("www.nutch.org\nwww.apache.org"); sc.setBlackList("jpg\nwww.apache.org/zecret/"); - - //matches whitelist - Assert.assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); - - //matches blacklist - Assert.assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); + + // matches whitelist + Assert.assertEquals("http://www.apache.org/index.html", + sc.filter("http://www.apache.org/index.html")); + + // matches blacklist + Assert.assertEquals(null, + sc.filter("http://www.apache.org/zecret/index.html")); Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); - - //no match + + // no match Assert.assertEquals(null, sc.filter("http://www.google.com/")); } - + @Test - public void testInput(){ - StringBuffer xml=new StringBuffer(); + public void testInput() { + StringBuffer xml = new StringBuffer(); xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); xml.append("<!-- just a comment -->"); xml.append("<subcollections>"); @@ -65,44 +68,45 @@ public class TestSubcollection { xml.append("</blacklist>"); xml.append("</subcollection>"); xml.append("</subcollections>"); - - InputStream is=new ByteArrayInputStream(xml.toString().getBytes()); - - CollectionManager cm=new CollectionManager(); + + InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); + + CollectionManager cm = new CollectionManager(); cm.parse(is); - - Collection<?> c=cm.getAll(); - + + Collection<?> c = cm.getAll(); + // test that size matches - Assert.assertEquals(1,c.size()); - - Subcollection collection=(Subcollection)c.toArray()[0]; - - //test collection id + Assert.assertEquals(1, c.size()); + + Subcollection collection = (Subcollection) c.toArray()[0]; + + // test collection id Assert.assertEquals("nutch", collection.getId()); - - //test collection name + + // test collection name Assert.assertEquals("nutch collection", collection.getName()); - //test whitelist - Assert.assertEquals(2,collection.whiteList.size()); - - String wlUrl=(String)collection.whiteList.get(0); + // test whitelist + Assert.assertEquals(2, collection.whiteList.size()); + + String wlUrl = (String) collection.whiteList.get(0); Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl); - wlUrl=(String)collection.whiteList.get(1); + wlUrl = (String) collection.whiteList.get(1); Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl); - - //matches whitelist - Assert.assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/")); - //test blacklist - Assert.assertEquals(1,collection.blackList.size()); + // matches whitelist + Assert.assertEquals("http://lucene.apache.org/nutch/", + collection.filter("http://lucene.apache.org/nutch/")); + + // test blacklist + Assert.assertEquals(1, collection.blackList.size()); - String blUrl=(String)collection.blackList.get(0); + String blUrl = (String) collection.blackList.get(0); Assert.assertEquals("http://www.xxx.yyy", blUrl); - //no match + // no match Assert.assertEquals(null, collection.filter("http://www.google.com/")); } } Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (original) +++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -34,23 +34,25 @@ import org.apache.nutch.util.domain.Doma /** * Adds the Top level domain extensions to the index + * * @author Enis Soztutar <[email protected]> */ public class TLDIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(TLDIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(TLDIndexingFilter.class); private Configuration conf; - public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks) - throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { try { URL url = new URL(urlText.toString()); DomainSuffix d = URLUtil.getDomainSuffix(url); - + doc.add("tld", d.getDomain()); - - }catch (Exception ex) { + + } catch (Exception ex) { LOG.warn(ex.toString()); } Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (original) +++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -35,9 +35,9 @@ import org.apache.nutch.scoring.ScoringF import org.apache.nutch.util.domain.DomainSuffix; import org.apache.nutch.util.domain.DomainSuffixes; - /** * Scoring filter to boost tlds. + * * @author Enis Soztutar <[email protected]> */ public class TLDScoringFilter implements ScoringFilter { @@ -56,10 +56,10 @@ public class TLDScoringFilter implements NutchField tlds = doc.getField("tld"); float boost = 1.0f; - if(tlds != null) { - for(Object tld : tlds.getValues()) { + if (tlds != null) { + for (Object tld : tlds.getValues()) { DomainSuffix entry = tldEntries.get(tld.toString()); - if(entry != null) + if (entry != null) boost *= entry.getBoost(); } } @@ -93,9 +93,8 @@ public class TLDScoringFilter implements throws ScoringFilterException { } - public void updateDbScore(Text url, CrawlDatum old, - CrawlDatum datum, List<CrawlDatum> inlinked) - throws ScoringFilterException { + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { } public Configuration getConf() { @@ -105,9 +104,10 @@ public class TLDScoringFilter implements public void setConf(Configuration conf) { this.conf = conf; } - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, - Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, - int allCount) throws ScoringFilterException { + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { return adjust; } Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Thu Jan 29 05:38:59 2015 @@ -32,12 +32,11 @@ import org.apache.nutch.net.*; import org.apache.nutch.urlfilter.api.RegexRule; import org.apache.nutch.urlfilter.api.RegexURLFilterBase; - /** - * RegexURLFilterBase implementation based on the - * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> - * Finite-State Automata for Java<sup>TM</sup>. - * + * RegexURLFilterBase implementation based on the <a + * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State + * Automata for Java<sup>TM</sup>. + * * @author Jérôme Charron * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> */ @@ -49,24 +48,24 @@ public class AutomatonURLFilter extends super(); } - public AutomatonURLFilter(String filename) - throws IOException, PatternSyntaxException { + public AutomatonURLFilter(String filename) throws IOException, + PatternSyntaxException { super(filename); } - AutomatonURLFilter(Reader reader) - throws IOException, IllegalArgumentException { + AutomatonURLFilter(Reader reader) throws IOException, + IllegalArgumentException { super(reader); } - - /* ----------------------------------- * - * <implementation:RegexURLFilterBase> * - * ----------------------------------- */ - + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + /** - * Rules specified as a config property will override rules specified - * as a config file. + * Rules specified as a config property will override rules specified as a + * config file. */ protected Reader getRulesReader(Configuration conf) throws IOException { String stringRules = conf.get(URLFILTER_AUTOMATON_RULES); @@ -81,21 +80,20 @@ public class AutomatonURLFilter extends protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - - /* ------------------------------------ * - * </implementation:RegexURLFilterBase> * - * ------------------------------------ */ - + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + public static void main(String args[]) throws IOException { main(new AutomatonURLFilter(), args); } - private class Rule extends RegexRule { - + private RunAutomaton automaton; - + Rule(boolean sign, String regex) { super(sign, regex); automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); @@ -105,5 +103,5 @@ public class AutomatonURLFilter extends return automaton.run(url); } } - + } Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Thu Jan 29 05:38:59 2015 @@ -26,10 +26,9 @@ import org.apache.nutch.urlfilter.api.Re import org.junit.Assert; import org.junit.Test; - /** * JUnit based test of class <code>AutomatonURLFilter</code>. - * + * * @author Jérôme Charron */ public class TestAutomatonURLFilter extends RegexURLFilterBaseTest { @@ -42,7 +41,7 @@ public class TestAutomatonURLFilter exte return null; } } - + @Test public void test() { test("WholeWebCrawling"); Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Jan 29 05:38:59 2015 @@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.domain.DomainSuffix; /** - * <p>Filters URLs based on a file containing domain suffixes, domain names, and + * <p> + * Filters URLs based on a file containing domain suffixes, domain names, and * hostnames. Only a url that matches one of the suffixes, domains, or hosts - * present in the file is allowed.</p> + * present in the file is allowed. + * </p> * - * <p>Urls are checked in order of domain suffix, domain name, and hostname - * against entries in the domain file. The domain file would be setup as follows - * with one entry per line: - * - * <pre> com apache.org www.apache.org </pre> - * - * <p>The first line is an example of a filter that would allow all .com - * domains. The second line allows all urls from apache.org and all of its - * subdomains such as lucene.apache.org and hadoop.apache.org. The third line - * would allow only urls from www.apache.org. There is no specific ordering to - * entries. The entries are from more general to more specific with the more - * general overridding the more specific.</p> + * <p> + * Urls are checked in order of domain suffix, domain name, and hostname against + * entries in the domain file. The domain file would be setup as follows with + * one entry per line: + * + * <pre> + * com apache.org www.apache.org + * </pre> + * + * <p> + * The first line is an example of a filter that would allow all .com domains. + * The second line allows all urls from apache.org and all of its subdomains + * such as lucene.apache.org and hadoop.apache.org. The third line would allow + * only urls from www.apache.org. There is no specific ordering to entries. The + * entries are from more general to more specific with the more general + * overridding the more specific. + * </p> * * The domain file defaults to domain-urlfilter.txt in the classpath but can be * overridden using the: * - * <ul> <ol>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</ol> - * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul> + * <ul> + * <ol> + * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and + * </ol> + * <ol> + * attribute "file" in plugin.xml of this plugin + * </ol> + * </ul> * * the attribute "file" has higher precedence if defined. */ -public class DomainURLFilter - implements URLFilter { +public class DomainURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(DomainURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(DomainURLFilter.class); // read in attribute "file" of this plugin. private static String attributeFile = null; @@ -71,8 +84,7 @@ public class DomainURLFilter private String domainFile = null; private Set<String> domainSet = new LinkedHashSet<String>(); - private void readConfiguration(Reader configReader) - throws IOException { + private void readConfiguration(Reader configReader) throws IOException { // read the configuration file, line by line BufferedReader reader = new BufferedReader(configReader); @@ -95,7 +107,8 @@ public class DomainURLFilter /** * Constructor that specifies the domain file to use. * - * @param domainFile The domain file, overrides domain-urlfilter.text default. + * @param domainFile + * The domain file, overrides domain-urlfilter.text default. * * @throws IOException */ @@ -111,8 +124,8 @@ public class DomainURLFilter // get the extensions for domain urlfilter String pluginName = "urlfilter-domain"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -120,32 +133,30 @@ public class DomainURLFilter break; } } - + // handle blank non empty input if (attributeFile != null && attributeFile.trim().equals("")) { attributeFile = null; } - + if (attributeFile != null) { if (LOG.isInfoEnabled()) { LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); + + " as " + attributeFile); } - } - else { + } else { if (LOG.isWarnEnabled()) { LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); + + pluginName); } } // domain file and attribute "file" take precedence if defined - String file = conf.get("urlfilter.domain.file"); + String file = conf.get("urlfilter.domain.file"); String stringRules = conf.get("urlfilter.domain.rules"); if (domainFile != null) { file = domainFile; - } - else if (attributeFile != null) { + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; @@ -159,8 +170,7 @@ public class DomainURLFilter reader = new FileReader(file); } readConfiguration(reader); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } @@ -173,7 +183,7 @@ public class DomainURLFilter try { - // match for suffix, domain, and host in that order. more general will + // match for suffix, domain, and host in that order. more general will // override more specific String domain = URLUtil.getDomainName(url).toLowerCase().trim(); String host = URLUtil.getHost(url); @@ -182,20 +192,19 @@ public class DomainURLFilter if (domainSuffix != null) { suffix = domainSuffix.getDomain(); } - + if (domainSet.contains(suffix) || domainSet.contains(domain) - || domainSet.contains(host)) { + || domainSet.contains(host)) { return url; } // doesn't match, don't allow return null; - } - catch (Exception e) { - + } catch (Exception e) { + // if an error happens, allow the url to pass LOG.error("Could not apply filter on url: " + url + "\n" - + org.apache.hadoop.util.StringUtils.stringifyException(e)); + + org.apache.hadoop.util.StringUtils.stringifyException(e)); return null; } } Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Jan 29 05:38:59 2015 @@ -23,13 +23,11 @@ import org.junit.Test; public class TestDomainURLFilter { - private final static String SEPARATOR = System.getProperty("file.separator"); private final static String SAMPLES = System.getProperty("test.data", "."); @Test - public void testFilter() - throws Exception { + public void testFilter() throws Exception { String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; Configuration conf = NutchConfiguration.create(); Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015 @@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.domain.DomainSuffix; /** - * <p>Filters URLs based on a file containing domain suffixes, domain names, and - * hostnames. A url that matches one of the suffixes, domains, or hosts - * present in the file is filtered out.</p> - * - * <p>Urls are checked in order of domain suffix, domain name, and hostname - * against entries in the domain file. The domain file would be setup as follows - * with one entry per line: - * - * <pre> com apache.org www.apache.org </pre> - * - * <p>The first line is an example of a filter that would allow all .com - * domains. The second line allows all urls from apache.org and all of its - * subdomains such as lucene.apache.org and hadoop.apache.org. The third line - * would allow only urls from www.apache.org. There is no specific ordering to - * entries. The entries are from more general to more specific with the more - * general overridding the more specific.</p> - * - * The domain file defaults to domainblacklist-urlfilter.txt in the classpath but can be - * overridden using the: - * - * <ul> <ol>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</ol> - * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul> + * <p> + * Filters URLs based on a file containing domain suffixes, domain names, and + * hostnames. A url that matches one of the suffixes, domains, or hosts present + * in the file is filtered out. + * </p> + * + * <p> + * Urls are checked in order of domain suffix, domain name, and hostname against + * entries in the domain file. The domain file would be setup as follows with + * one entry per line: + * + * <pre> + * com apache.org www.apache.org + * </pre> + * + * <p> + * The first line is an example of a filter that would allow all .com domains. + * The second line allows all urls from apache.org and all of its subdomains + * such as lucene.apache.org and hadoop.apache.org. The third line would allow + * only urls from www.apache.org. There is no specific ordering to entries. The + * entries are from more general to more specific with the more general + * overridding the more specific. + * </p> + * + * The domain file defaults to domainblacklist-urlfilter.txt in the classpath + * but can be overridden using the: + * + * <ul> + * <ol> + * property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and + * </ol> + * <ol> + * attribute "file" in plugin.xml of this plugin + * </ol> + * </ul> * * the attribute "file" has higher precedence if defined. */ -public class DomainBlacklistURLFilter - implements URLFilter { +public class DomainBlacklistURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(DomainBlacklistURLFilter.class); // read in attribute "file" of this plugin. private static String attributeFile = null; @@ -71,8 +84,7 @@ public class DomainBlacklistURLFilter private String domainFile = null; private Set<String> domainSet = new LinkedHashSet<String>(); - private void readConfiguration(Reader configReader) - throws IOException { + private void readConfiguration(Reader configReader) throws IOException { // read the configuration file, line by line BufferedReader reader = new BufferedReader(configReader); @@ -95,7 +107,8 @@ public class DomainBlacklistURLFilter /** * Constructor that specifies the domain file to use. * - * @param domainFile The domain file, overrides domainblacklist-urlfilter.text default. + * @param domainFile + * The domain file, overrides domainblacklist-urlfilter.text default. * * @throws IOException */ @@ -111,8 +124,8 @@ public class DomainBlacklistURLFilter // get the extensions for domain urlfilter String pluginName = "urlfilter-domainblacklist"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -129,23 +142,21 @@ public class DomainBlacklistURLFilter if (attributeFile != null) { if (LOG.isInfoEnabled()) { LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); + + " as " + attributeFile); } - } - else { + } else { if (LOG.isWarnEnabled()) { LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); + + pluginName); } } // domain file and attribute "file" take precedence if defined - String file = conf.get("urlfilter.domainblacklist.file"); + String file = conf.get("urlfilter.domainblacklist.file"); String stringRules = conf.get("urlfilter.domainblacklist.rules"); if (domainFile != null) { file = domainFile; - } - else if (attributeFile != null) { + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; @@ -159,8 +170,7 @@ public class DomainBlacklistURLFilter reader = new FileReader(file); } readConfiguration(reader); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } @@ -171,7 +181,7 @@ public class DomainBlacklistURLFilter public String filter(String url) { try { - // match for suffix, domain, and host in that order. more general will + // match for suffix, domain, and host in that order. more general will // override more specific String domain = URLUtil.getDomainName(url).toLowerCase().trim(); String host = URLUtil.getHost(url); @@ -182,19 +192,18 @@ public class DomainBlacklistURLFilter } if (domainSet.contains(suffix) || domainSet.contains(domain) - || domainSet.contains(host)) { + || domainSet.contains(host)) { // Matches, filter! return null; } // doesn't match, allow return url; - } - catch (Exception e) { + } catch (Exception e) { // if an error happens, allow the url to pass LOG.error("Could not apply filter on url: " + url + "\n" - + org.apache.hadoop.util.StringUtils.stringifyException(e)); + + org.apache.hadoop.util.StringUtils.stringifyException(e)); return null; } } Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015 @@ -27,12 +27,12 @@ public class TestDomainBlacklistURLFilte private final static String SAMPLES = System.getProperty("test.data", "."); @Test - public void testFilter() - throws Exception { + public void testFilter() throws Exception { String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt"; Configuration conf = NutchConfiguration.create(); - DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(domainBlacklistFile); + DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter( + domainBlacklistFile); domainBlacklistFilter.setConf(conf); Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org")); Modified: nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Thu Jan 29 05:38:59 2015 @@ -39,16 +39,19 @@ import java.util.List; import java.util.ArrayList; /** - * Filters URLs based on a file of URL prefixes. The file is named by - * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and - * (2) attribute "file" in plugin.xml of this plugin - * Attribute "file" has higher precedence if defined. - * - * <p>The format of this file is one URL prefix per line.</p> + * Filters URLs based on a file of URL prefixes. The file is named by (1) + * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2) + * attribute "file" in plugin.xml of this plugin Attribute "file" has higher + * precedence if defined. + * + * <p> + * The format of this file is one URL prefix per line. + * </p> */ public class PrefixURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(PrefixURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(PrefixURLFilter.class); // read in attribute "file" of this plugin. private static String attributeFile = null; @@ -58,7 +61,7 @@ public class PrefixURLFilter implements private Configuration conf; public PrefixURLFilter() throws IOException { - + } public PrefixURLFilter(String stringRules) throws IOException { @@ -72,43 +75,43 @@ public class PrefixURLFilter implements return url; } - private TrieStringMatcher readConfiguration(Reader reader) - throws IOException { - - BufferedReader in=new BufferedReader(reader); + private TrieStringMatcher readConfiguration(Reader reader) throws IOException { + + BufferedReader in = new BufferedReader(reader); List<String> urlprefixes = new ArrayList<String>(); String line; - while((line=in.readLine())!=null) { + while ((line = in.readLine()) != null) { if (line.length() == 0) continue; - char first=line.charAt(0); + char first = line.charAt(0); switch (first) { - case ' ' : case '\n' : case '#' : // skip blank & comment lines + case ' ': + case '\n': + case '#': // skip blank & comment lines continue; - default : - urlprefixes.add(line); + default: + urlprefixes.add(line); } } return new PrefixStringMatcher(urlprefixes); } - public static void main(String args[]) - throws IOException { - + public static void main(String args[]) throws IOException { + PrefixURLFilter filter; if (args.length >= 1) filter = new PrefixURLFilter(args[0]); else filter = new PrefixURLFilter(); - - BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { - String out=filter.filter(line); - if(out!=null) { + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { System.out.println(out); } } @@ -118,8 +121,8 @@ public class PrefixURLFilter implements this.conf = conf; String pluginName = "urlfilter-prefix"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -136,8 +139,8 @@ public class PrefixURLFilter implements } } else { // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); + // LOG.warn("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); // } } @@ -159,7 +162,9 @@ public class PrefixURLFilter implements try { trie = readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } // TODO [email protected]: throw Exception? Because broken api. throw new RuntimeException(e.getMessage(), e); } @@ -169,5 +174,5 @@ public class PrefixURLFilter implements public Configuration getConf() { return this.conf; } - + } Modified: nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Thu Jan 29 05:38:59 2015 @@ -28,13 +28,12 @@ import org.apache.nutch.urlfilter.api.Re import org.apache.nutch.urlfilter.api.RegexURLFilterBase; import org.apache.nutch.util.NutchConfiguration; - /** * Filters URLs based on a file of regular expressions using the * {@link java.util.regex Java Regex implementation}. */ public class RegexURLFilter extends RegexURLFilterBase { - + public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file"; public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules"; @@ -42,24 +41,23 @@ public class RegexURLFilter extends Rege super(); } - public RegexURLFilter(String filename) - throws IOException, PatternSyntaxException { + public RegexURLFilter(String filename) throws IOException, + PatternSyntaxException { super(filename); } - RegexURLFilter(Reader reader) - throws IOException, IllegalArgumentException { + RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException { super(reader); } - - /* ----------------------------------- * - * <implementation:RegexURLFilterBase> * - * ----------------------------------- */ - + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + /** - * Rules specified as a config property will override rules specified - * as a config file. + * Rules specified as a config property will override rules specified as a + * config file. */ protected Reader getRulesReader(Configuration conf) throws IOException { String stringRules = conf.get(URLFILTER_REGEX_RULES); @@ -74,23 +72,22 @@ public class RegexURLFilter extends Rege protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - - /* ------------------------------------ * - * </implementation:RegexURLFilterBase> * - * ------------------------------------ */ - + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + public static void main(String args[]) throws IOException { RegexURLFilter filter = new RegexURLFilter(); filter.setConf(NutchConfiguration.create()); main(filter, args); } - private class Rule extends RegexRule { - + private Pattern pattern; - + Rule(boolean sign, String regex) { super(sign, regex); pattern = Pattern.compile(regex); @@ -100,5 +97,5 @@ public class RegexURLFilter extends Rege return pattern.matcher(url).find(); } } - + } Modified: nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Thu Jan 29 05:38:59 2015 @@ -26,15 +26,13 @@ import org.apache.nutch.urlfilter.api.Re import org.junit.Assert; import org.junit.Test; - /** * JUnit based test of class <code>RegexURLFilter</code>. - * + * * @author Jérôme Charron */ public class TestRegexURLFilter extends RegexURLFilterBaseTest { - protected URLFilter getURLFilter(Reader rules) { try { return new RegexURLFilter(rules); @@ -43,7 +41,7 @@ public class TestRegexURLFilter extends return null; } } - + @Test public void test() { test("WholeWebCrawling"); Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Thu Jan 29 05:38:59 2015 @@ -51,14 +51,15 @@ import java.net.MalformedURLException; * Attribute "file" has higher precedence if defined. If the config file is * missing, all URLs will be rejected. * - * <p>This filter can be configured to work in one of two modes: + * <p> + * This filter can be configured to work in one of two modes: * <ul> - * <li><b>default to reject</b> ('-'): in this mode, only URLs that match suffixes - * specified in the config file will be accepted, all other URLs will be - * rejected.</li> - * <li><b>default to accept</b> ('+'): in this mode, only URLs that match suffixes - * specified in the config file will be rejected, all other URLs will be - * accepted.</li> + * <li><b>default to reject</b> ('-'): in this mode, only URLs that match + * suffixes specified in the config file will be accepted, all other URLs will + * be rejected.</li> + * <li><b>default to accept</b> ('+'): in this mode, only URLs that match + * suffixes specified in the config file will be rejected, all other URLs will + * be accepted.</li> * </ul> * <p> * The format of this config file is one URL suffix per line, with no preceding @@ -67,10 +68,10 @@ import java.net.MalformedURLException; * </p> * <p> * A single '+' or '-' sign not followed by any suffix must be used once, to - * signify the mode this plugin operates in. An optional single 'I' can be appended, - * to signify that suffix matches should be case-insensitive. The default, if - * not specified, is to use case-sensitive matches, i.e. suffix '.JPG' - * does not match '.jpg'. + * signify the mode this plugin operates in. An optional single 'I' can be + * appended, to signify that suffix matches should be case-insensitive. The + * default, if not specified, is to use case-sensitive matches, i.e. suffix + * '.JPG' does not match '.jpg'. * </p> * <p> * NOTE: the format of this file is different from urlfilter-prefix, because @@ -82,8 +83,8 @@ import java.net.MalformedURLException; * <h4>Example 1</h4> * <p> * The configuration shown below will accept all URLs with '.html' or '.htm' - * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), - * and prohibit all other suffixes. + * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit + * all other suffixes. * <p> * * <pre> @@ -91,7 +92,7 @@ import java.net.MalformedURLException; * * # prohibit all unknown, case-sensitive matching * - - * + * * # collect only HTML files. * .html * .htm @@ -119,11 +120,13 @@ import java.net.MalformedURLException; * </pre> * * </p> + * * @author Andrzej Bialecki */ public class SuffixURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(SuffixURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(SuffixURLFilter.class); // read in attribute "file" of this plugin. private String attributeFile = null; @@ -144,11 +147,13 @@ public class SuffixURLFilter implements } public String filter(String url) { - if (url == null) return null; + if (url == null) + return null; String _url; if (ignoreCase) _url = url.toLowerCase(); - else _url = url; + else + _url = url; if (filterFromPath) { try { URL pUrl = new URL(_url); @@ -160,11 +165,15 @@ public class SuffixURLFilter implements String a = suffixes.shortestMatch(_url); if (a == null) { - if (modeAccept) return url; - else return null; + if (modeAccept) + return url; + else + return null; } else { - if (modeAccept) return null; - else return url; + if (modeAccept) + return null; + else + return url; } } @@ -187,30 +196,31 @@ public class SuffixURLFilter implements String line; while ((line = in.readLine()) != null) { - if (line.length() == 0) continue; + if (line.length() == 0) + continue; char first = line.charAt(0); switch (first) { - case ' ': - case '\n': - case '#': // skip blank & comment lines - break; - case '-': - allow = false; - if(line.contains("P")) - filterFromPath = true; - if(line.contains("I")) - ignore = true; - break; - case '+': - allow = true; - if(line.contains("P")) - filterFromPath = true; - if(line.contains("I")) - ignore = true; - break; - default: - aSuffixes.add(line); + case ' ': + case '\n': + case '#': // skip blank & comment lines + break; + case '-': + allow = false; + if (line.contains("P")) + filterFromPath = true; + if (line.contains("I")) + ignore = true; + break; + case '+': + allow = true; + if (line.contains("P")) + filterFromPath = true; + if (line.contains("I")) + ignore = true; + break; + default: + aSuffixes.add(line); } } if (ignore) { @@ -249,7 +259,8 @@ public class SuffixURLFilter implements this.conf = conf; String pluginName = "urlfilter-suffix"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -257,22 +268,25 @@ public class SuffixURLFilter implements break; } } - if (attributeFile != null && attributeFile.trim().equals("")) attributeFile = null; + if (attributeFile != null && attributeFile.trim().equals("")) + attributeFile = null; if (attributeFile != null) { if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile); + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); } } else { // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); + // LOG.warn("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); // } } String file = conf.get("urlfilter.suffix.file"); String stringRules = conf.get("urlfilter.suffix.rules"); // attribute "file" takes precedence if defined - if (attributeFile != null) file = attributeFile; + if (attributeFile != null) + file = attributeFile; Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); @@ -283,7 +297,9 @@ public class SuffixURLFilter implements try { readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } throw new RuntimeException(e.getMessage(), e); } } Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Thu Jan 29 05:38:59 2015 @@ -25,101 +25,45 @@ import org.junit.Test; /** * JUnit test for <code>SuffixURLFilter</code>. - * + * * @author Andrzej Bialecki */ public class TestSuffixURLFilter { - private static final String suffixes = - "# this is a comment\n" + - "\n" + - ".gif\n" + - ".jpg\n" + - ".js\n"; - + private static final String suffixes = "# this is a comment\n" + "\n" + + ".gif\n" + ".jpg\n" + ".js\n"; + private static final String[] urls = new String[] { - "http://www.example.com/test.gif", - "http://www.example.com/TEST.GIF", - "http://www.example.com/test.jpg", - "http://www.example.com/test.JPG", - "http://www.example.com/test.html", - "http://www.example.com/test.HTML", - "http://www.example.com/test.html?q=abc.js", - "http://www.example.com/test.js?foo=bar&baz=bar#12333", - }; - - private static String[] urlsModeAccept = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - null, - urls[7] - }; - - private static String[] urlsModeReject = new String[] { - urls[0], - null, - urls[2], - null, - null, - null, - urls[6], - null - }; - - private static String[] urlsModeAcceptIgnoreCase = new String[] { - null, - null, - null, - null, - urls[4], - urls[5], - null, - urls[7] - }; - - private static String[] urlsModeRejectIgnoreCase = new String[] { - urls[0], - urls[1], - urls[2], - urls[3], - null, - null, - urls[6], - null - }; - - private static String[] urlsModeAcceptAndPathFilter = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - urls[6], - null - }; - - private static String[] urlsModeAcceptAndNonPathFilter = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - null, - urls[7] - }; - + "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF", + "http://www.example.com/test.jpg", "http://www.example.com/test.JPG", + "http://www.example.com/test.html", "http://www.example.com/test.HTML", + "http://www.example.com/test.html?q=abc.js", + "http://www.example.com/test.js?foo=bar&baz=bar#12333", }; + + private static String[] urlsModeAccept = new String[] { null, urls[1], null, + urls[3], urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeReject = new String[] { urls[0], null, + urls[2], null, null, null, urls[6], null }; + + private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null, + null, null, urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0], + urls[1], urls[2], urls[3], null, null, urls[6], null }; + + private static String[] urlsModeAcceptAndPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], urls[6], null }; + + private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], null, urls[7] }; + private SuffixURLFilter filter = null; - + @Before public void setUp() throws IOException { filter = new SuffixURLFilter(new StringReader(suffixes)); } - + @Test public void testModeAccept() { filter.setIgnoreCase(false); @@ -155,22 +99,24 @@ public class TestSuffixURLFilter { Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); } } - + @Test public void testModeAcceptAndNonPathFilter() { filter.setModeAccept(true); filter.setFilterFromPath(false); for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i])); + Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter + .filter(urls[i])); } } - + @Test public void testModeAcceptAndPathFilter() { filter.setModeAccept(true); filter.setFilterFromPath(true); for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i])); + Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter + .filter(urls[i])); } } Modified: nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original) +++ nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Thu Jan 29 05:38:59 2015 @@ -23,12 +23,16 @@ import org.apache.hadoop.conf.Configurat import org.apache.nutch.net.URLFilter; /** - * <p>Validates URLs.</p> - * - * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, - * Date: 03/07/02, - * http://javascript.internet.com. However, this validation now bears little - * resemblance to the php original.</p> + * <p> + * Validates URLs. + * </p> + * + * <p> + * Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: + * 03/07/02, http://javascript.internet.com. However, this validation now bears + * little resemblance to the php original. + * </p> + * * <pre> * Example of usage: * UrlValidator urlValidator = UrlValidator.get(); @@ -37,17 +41,17 @@ import org.apache.nutch.net.URLFilter; * } else { * System.out.println("url is invalid"); * } - * + * * prints out "url is valid" - * </pre> - * - * <p>Based on UrlValidator code from Apache commons-validator.</p> - * - * @see - * <a href='http://www.ietf.org/rfc/rfc2396.txt' > - * Uniform Resource Identifiers (URI): Generic Syntax - * </a> - * + * </pre> + * + * <p> + * Based on UrlValidator code from Apache commons-validator. + * </p> + * + * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource + * Identifiers (URI): Generic Syntax </a> + * */ public class UrlValidator implements URLFilter { @@ -61,7 +65,7 @@ public class UrlValidator implements URL private static final String SCHEME_CHARS = ALPHA_CHARS; - // Drop numeric, and "+-." for now + // Drop numeric, and "+-." for now private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\."; private static final String ATOM = VALID_CHARS + '+'; @@ -69,9 +73,9 @@ public class UrlValidator implements URL /** * This expression derived/taken from the BNF for URI (RFC2396). */ - private static final Pattern URL_PATTERN = - Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" + - "(\\?([^#]*))?(#(.*))?"); + private static final Pattern URL_PATTERN = Pattern + .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" + + "(\\?([^#]*))?(#(.*))?"); /** * Schema/Protocol (ie. http:, ftp:, file:, etc). @@ -90,11 +94,11 @@ public class UrlValidator implements URL /** * Protocol (ie. http:, ftp:,https:). */ - private static final Pattern SCHEME_PATTERN = - Pattern.compile("^[" + SCHEME_CHARS + "]+"); + private static final Pattern SCHEME_PATTERN = Pattern.compile("^[" + + SCHEME_CHARS + "]+"); - private static final Pattern AUTHORITY_PATTERN = - Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?"); + private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^([" + + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?"); private static final int PARSE_AUTHORITY_HOST_IP = 1; @@ -105,28 +109,26 @@ public class UrlValidator implements URL */ private static final int PARSE_AUTHORITY_EXTRA = 3; - private static final Pattern PATH_PATTERN = - Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"); + private static final Pattern PATH_PATTERN = Pattern + .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"); private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$"); - private static final Pattern LEGAL_ASCII_PATTERN = - Pattern.compile("^[\\x21-\\x7E]+$"); + private static final Pattern LEGAL_ASCII_PATTERN = Pattern + .compile("^[\\x21-\\x7E]+$"); - private static final Pattern IP_V4_DOMAIN_PATTERN = - Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$"); + private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern + .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$"); - private static final Pattern DOMAIN_PATTERN = - Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$"); + private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM + + "(\\." + ATOM + ")*$"); - private static final Pattern PORT_PATTERN = - Pattern.compile("^:(\\d{1,5})$"); + private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$"); - private static final Pattern ATOM_PATTERN = - Pattern.compile("(" + ATOM + ")"); + private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")"); - private static final Pattern ALPHA_PATTERN = - Pattern.compile("^[" + ALPHA_CHARS + "]"); + private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + + ALPHA_CHARS + "]"); private Configuration conf; @@ -143,10 +145,13 @@ public class UrlValidator implements URL } /** - * <p>Checks if a field has a valid url address.</p> - * - * @param value The value validation is being performed on. - * A <code>null</code> value is considered invalid. + * <p> + * Checks if a field has a valid url address. + * </p> + * + * @param value + * The value validation is being performed on. A <code>null</code> + * value is considered invalid. * @return true if the url is valid. */ private boolean isValid(String value) { @@ -184,11 +189,13 @@ public class UrlValidator implements URL } /** - * Validate scheme. If schemes[] was initialized to a non null, - * then only those scheme's are allowed. Note this is slightly different - * than for the constructor. - * @param scheme The scheme to validate. A <code>null</code> value is - * considered invalid. + * Validate scheme. If schemes[] was initialized to a non null, then only + * those scheme's are allowed. Note this is slightly different than for the + * constructor. + * + * @param scheme + * The scheme to validate. A <code>null</code> value is considered + * invalid. * @return true if valid. */ private boolean isValidScheme(String scheme) { @@ -200,10 +207,12 @@ public class UrlValidator implements URL } /** - * Returns true if the authority is properly formatted. An authority is - * the combination of hostname and port. A <code>null</code> authority - * value is considered invalid. - * @param authority Authority value to validate. + * Returns true if the authority is properly formatted. An authority is the + * combination of hostname and port. A <code>null</code> authority value is + * considered invalid. + * + * @param authority + * Authority value to validate. * @return true if authority (hostname and port) is valid. */ private boolean isValidAuthority(String authority) { @@ -235,7 +244,7 @@ public class UrlValidator implements URL if (Integer.parseInt(ipSegment) > 255) { return false; } - } catch(NumberFormatException e) { + } catch (NumberFormatException e) { return false; } @@ -251,8 +260,8 @@ public class UrlValidator implements URL // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203 char[] chars = hostIP.toCharArray(); int size = 1; - for(int i=0; i<chars.length; i++) { - if(chars[i] == '.') { + for (int i = 0; i < chars.length; i++) { + if (chars[i] == '.') { size++; } } @@ -264,8 +273,7 @@ public class UrlValidator implements URL while (atomMatcher.find()) { domainSegment[segCount] = atomMatcher.group(); segLen = domainSegment[segCount].length() + 1; - hostIP = (segLen >= hostIP.length()) ? "" - : hostIP.substring(segLen); + hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen); segCount++; } String topLevel = domainSegment[segCount - 1]; @@ -300,10 +308,13 @@ public class UrlValidator implements URL } /** - * <p>Checks if the field isn't null and length of the field is greater - * than zero not including whitespace.</p> - * - * @param value The value validation is being performed on. + * <p> + * Checks if the field isn't null and length of the field is greater than zero + * not including whitespace. + * </p> + * + * @param value + * The value validation is being performed on. * @return true if blank or null. */ private boolean isBlankOrNull(String value) { @@ -311,9 +322,11 @@ public class UrlValidator implements URL } /** - * Returns true if the path is valid. A <code>null</code> value is - * considered invalid. - * @param path Path value to validate. + * Returns true if the path is valid. A <code>null</code> value is considered + * invalid. + * + * @param path + * Path value to validate. * @return true if path is valid. */ private boolean isValidPath(String path) { @@ -335,7 +348,9 @@ public class UrlValidator implements URL /** * Returns true if the query is null or it's a properly formatted query * string. - * @param query Query value to validate. + * + * @param query + * Query value to validate. * @return true if query is valid. */ private boolean isValidQuery(String query) { @@ -348,8 +363,11 @@ public class UrlValidator implements URL /** * Returns the number of times the token appears in the target. - * @param token Token value to be counted. - * @param target Target value to count tokens in. + * + * @param token + * Token value to be counted. + * @param target + * Target value to count tokens in. * @return the number of tokens. */ private int countToken(String token, String target) { Modified: nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (original) +++ nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Thu Jan 29 05:38:59 2015 @@ -21,40 +21,59 @@ import org.junit.Assert; import org.junit.Test; /** - * JUnit test case which tests - * 1. that valid urls are not filtered while invalid ones are filtered. - * 2. that Urls' scheme, authority, path and query are validated. + * JUnit test case which tests 1. that valid urls are not filtered while invalid + * ones are filtered. 2. that Urls' scheme, authority, path and query are + * validated. * * @author tejasp - * + * */ public class TestUrlValidator { /** - * Test method for {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}. + * Test method for + * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)} + * . */ @Test public void testFilter() { UrlValidator url_validator = new UrlValidator(); Assert.assertNotNull(url_validator); - Assert.assertNull("Filtering on a null object should return null", url_validator.filter(null)); - Assert.assertNull("Invalid url: example.com/file[/].html", url_validator.filter("example.com/file[/].html")); - Assert.assertNull("Invalid url: http://www.example.com/space here.html", url_validator.filter("http://www.example.com/space here.html")); - Assert.assertNull("Invalid url: /main.html", url_validator.filter("/main.html")); - Assert.assertNull("Invalid url: www.example.com/main.html", url_validator.filter("www.example.com/main.html")); - Assert.assertNull("Invalid url: ftp:www.example.com/main.html", url_validator.filter("ftp:www.example.com/main.html")); - Assert.assertNull("Inalid url: http://999.000.456.32/nutch/trunk/README.txt", + Assert.assertNull("Filtering on a null object should return null", + url_validator.filter(null)); + Assert.assertNull("Invalid url: example.com/file[/].html", + url_validator.filter("example.com/file[/].html")); + Assert.assertNull("Invalid url: http://www.example.com/space here.html", + url_validator.filter("http://www.example.com/space here.html")); + Assert.assertNull("Invalid url: /main.html", + url_validator.filter("/main.html")); + Assert.assertNull("Invalid url: www.example.com/main.html", + url_validator.filter("www.example.com/main.html")); + Assert.assertNull("Invalid url: ftp:www.example.com/main.html", + url_validator.filter("ftp:www.example.com/main.html")); + Assert.assertNull( + "Inalid url: http://999.000.456.32/nutch/trunk/README.txt", url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt")); - Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", url_validator.filter(" http://www.example.com/ma|in\\toc.html")); + Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", + url_validator.filter(" http://www.example.com/ma|in\\toc.html")); - Assert.assertNotNull("Valid url: https://issues.apache.org/jira/NUTCH-1127", url_validator.filter("https://issues.apache.org/jira/NUTCH-1127")); - Assert.assertNotNull("Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather", - url_validator.filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather")); - Assert.assertNotNull("Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", - url_validator.filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress")); - Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf")); + Assert.assertNotNull( + "Valid url: https://issues.apache.org/jira/NUTCH-1127", + url_validator.filter("https://issues.apache.org/jira/NUTCH-1127")); + Assert + .assertNotNull( + "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather", + url_validator + .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather")); + Assert + .assertNotNull( + "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", + url_validator + .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress")); + Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", + url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf")); } }
