[67/69] [abbrv] nutch git commit: Moved test resources to maven's test resources directory

thammegowda Tue, 05 Jul 2016 15:51:00 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls b/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
deleted file mode 100644
index 40bf4ee..0000000
--- a/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
+++ /dev/null
@@ -1,297 +0,0 @@
-+http://www.hostip.info/
--http://www.elanceur.org/Articles/OntologieSurfaite.html
-+http://www.opensymphony.com/quartz/
--http://www.portletbridge.org/saxbenchmark/index.html
-+http://www.lesmotsdelinfo.com/
-+http://usefulinc.com/doap/
-+http://www.codezoo.com/
-+http://search.infocious.com/
--http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
-+http://www.brics.dk/%7Eamoeller/automaton/
-+http://jazzz.com/wp.html
-+http://www.maxkiesler.com/index.php
-+http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
-+http://www.alias-i.com/lingpipe/
--http://johnny.ihackstuff.com/index.php?module=prodreviews
--http://www.spurl.net/
-+http://www.dropload.com/
-+http://vivisimo.com/
-+http://www.marumushi.com/apps/newsmap/newsmap.cfm
-+http://www.ixquick.com/
--http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
-+http://www.mail-archive.com/
-+http://www.spymac.com/
--http://browsers.evolt.org/
--http://www.oswd.org/
-+http://www.stayinvisible.com/index.pl
-+http://java.sun.com/j2se/1.4.2/docs/api/index.html
-+http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
-+http://www.bloglines.com/
--http://www.fckeditor.net/
-+http://search.msn.com/
--http://www.grub.org/
-+http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
--http://www.mnot.net/cache_docs/
--http://www.furl.net/
-+http://www.blogpulse.com/
-+http://www.googlefight.com/
-+http://www.rokulabs.com/
--http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
--http://www.batbox.org/wrt54g-linux.html
--http://en.wikipedia.org/wiki/%s
-+http://www.sipcenter.com/
-+http://www.merriampark.com/ld.htm
-+http://anon.inf.tu-dresden.de/index_en.html
-+http://www.pluck.com/
-+http://www.tiddlywiki.com/
-+http://www.jux2.com/
-+http://clusty.com/
--http://findability.org/
-+http://www.searchengineshowdown.com/
-+http://www.nhacks.com/email/index.php
-+http://www.koders.com/
-+http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
-+http://www.gmailwiki.com/index.php/Main_Page
-+http://www.tadalist.com/
-+http://www.net2ftp.com/
-+http://www.streamload.com/
-+http://www.lucazappa.com/brilliantMaker/buttonImage.php
-+http://www.hybernaut.com/bdv/delicious-import.html
-+http://www.gtmcknight.com/buttons/
-+http://amb.vis.ne.jp/mozilla/scrapbook/
-+http://g-metrics.com/index.php
--http://tor.eff.org/
-+http://www.search-this.com/search_engine_decoder.asp
-+http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
-+http://www.adaptivepath.com/publications/essays/archives/000385.php
--http://isnoop.net/gmail/
--http://openweb.eu.org/
-+http://www.mistergooddeal.com/
-+http://javatoolbox.com/
--http://www.freenews.fr/
-+http://www.wikiwax.com/
--http://today.java.net/pub/a/today/2005/04/21/farm.html
-+http://users.skynet.be/J.Beever/pave.htm
-+http://www.lundi8h.com/
-+http://www.snap.com/
-+http://www.goosee.com/puppy/index.shtml
--http://www.softwarefreedom.org/index.html
--http://y.20q.net/
-+http://www.bitty.com/
-+http://www.lafraise.com/
--http://www.liquidinformation.org/
-+http://www.searchtools.com/
-+http://www.martinfowler.com/articles/injection.html
-+http://pdos.csail.mit.edu/scigen/
--http://developer.yahoo.net/blog/
-+http://blogger-templates.blogspot.com/
-+http://phpadsnew.com/two/
-+http://www.langreiter.com/exec/yahoo-vs-google.html
--http://www.dataparksearch.org/
--http://www.yubnub.org/
--http://www.fing.org/
--http://www.swish-e.org/
--http://www.openajax.net/wordpress/
-+http://crypto.stanford.edu/PwdHash/
-+http://www.html-kit.com/favicon/
--http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
-+http://www.durhamtownship.com/
-+http://jiwire.com/
-+http://www.insilmaril.de/vym/
--http://www.spreadshirt.net/
-+http://www.goffice.com/
-+http://www.writely.com/
-+http://www.milindparikh.com/
-+http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
-+http://www.wikyblog.com/Map/Guest/Home
--http://www.kottke.org/05/08/googleos-webos
-+http://www.rollyo.com/
-+http://www.meebo.com/
-+http://www.factbites.com/
-+http://www.placeopedia.com/
-+http://swoogle.umbc.edu/
-+http://www.viaduc.com/
--http://demo.wikiwyg.net/wikiwyg/demo/standalone/
-+http://podcasts.yahoo.com/
--http://beaglewiki.org/Main_Page
-+http://yq.search.yahoo.com/
--http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
-+http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
-+http://socialight.com/
-+http://www.lexxe.com/
-+http://www.xom.nu/
-+http://www.turboprint.de/
-+http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
-+http://www.wi-fiplanet.com/tutorials/article.php/3562391
-+http://particletree.com/features/10-tips-to-a-better-form/
-+http://www.songbirdnest.com/
--http://www.w3.org/Talks/Tools/Slidy/
--http://www.compassframework.org/display/SITE/Home
-+http://motrech.blogspot.com/
-+http://www.moteurzine.com/
-+http://www.mex-search.com/
--http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
-+http://www.goshme.com/
-+http://rialto.application-servers.com/
-+http://www.multe-pass.com/
-+http://www.tailrank.com/
-+http://www.vandertramp.com/INTERNETDOWN/
-+http://www.letterjames.de/index.html
-+http://code.google.com/index.html
-+http://www.kritx.com/
-+http://performancing.com/firefox
-+http://www.mywebsearch.com/
--http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
-+http://www.lukew.com/resources/articles/blogs2.asp
--http://www.hyperwords.net/
-+http://ajax.parish.ath.cx/translator/
-+http://www.maplandia.com/
--http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
-+http://onefeed.com/index.php
-+http://www.file-swap.com/
--http://opennlp.org/
-+http://mindprod.com/jgloss/encoding.html
-+http://code.google.com/webstats/index.html
-+http://www.freeweb-hosting.com/google_pagerank_pr_checker/
--http://www.framakey.org/
--http://microformats.org/wiki/hreview
--http://www.ashesandsnow.org/index2.html
--http://uima-framework.sourceforge.net/
-+http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
--http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
-+http://fr.techcrunch.com/
--http://developer.yahoo.net/yui/
-+http://www.fredrikodman.com/
-+http://www.mpirical.com/companion/mpirical_companion.html
-+http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
--http://k9copy.free.fr/
--http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
--http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
--http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
-+http://blogokat.canalblog.com/archives/2005/11/02/882454.html
-+http://robur.slu.se/jensl/xmlclitools/
--http://www.internetactu.net/?p=6291
--http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
-+http://www.memodata.com/2004/fr/alexandria/
--http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
-+http://www.randomerror.com/
-+http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
--http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
--http://interstices.info/display.jsp?id=c_15918
-+http://www.tech-invite.com/
-+http://www.croczilla.com/zap
--http://www.libervis.com/modules/wordpress/?p=13
-+http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
--http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
-+http://www.influo.com/
-+http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
--http://www.addnb.org/fr/docs/webinvisible.htm
--http://manhack.net/
--http://www.jibaku.net/
-+http://www.pipologie.com/
-+http://christophenoel.blogspot.com/
--http://www.seekport.fr/seekbot/
-+http://beta.exalead.com/
--http://www.boolgum.fr/index.html
-+http://www.kesako.canalblog.com/
-+http://loran.blogspot.com/
-+http://outils-recherche.blogspot.com/
-+http://www.art-dept.com/artists/giacobbe/
-+http://www.meggould.netfirms.com/site_seeingIII.htm
-+http://www.freedpi.com/
-+http://www.frenchfred.com/
-+http://www.photoways.com/
--http://freco.free.fr/index.htm
--http://triturages.free.fr/index.htm
--http://www.qsos.org/
-+http://www.alvis.info/alvis/
-+http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
--http://www.shinux.org/
-+http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
-+http://www.kurobox.com/online/tiki-index.php
--http://news.gmane.org/gmane.comp.misc.linkstation.linux
-+http://www.imsbook.com/SIP-IMS-Standards-List.html
--http://incubator.apache.org/directory/subprojects/snickers/
--http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
--http://sourceforge.net/projects/cryptix-asn1/
--http://sourceforge.net/projects/basn/
--http://asn1.elibel.tm.fr/fr/index.htm
--http://sourceforge.net/projects/a2j/
-+http://www.degrouptest.com/
-+http://interstices.info/
-+http://louvre-boite.viabloga.com/news/18.shtml
--http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
-+http://poiplace.oabsoftware.nl/
--http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
--http://www.yoono.com/favorites.jsp?user-id=lquerel
--http://www.librecours.org/cgi-bin/main
--http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
--http://limo.sourceforge.net/
-+http://www-scf.usc.edu/%7Emattmann/
-+http://spaces.msn.com/members/famillezen/
--http://photos.joune.org/
--http://www.canon.fr/paperart/
-+http://flash.eastweb.ru/files/20051024092150.swf
-+http://www.xsltwiki.com/index.php/Main_Page
-+http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
--http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
-+http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
--http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
-+http://www.aeliosfinance.com/
-+http://www.capital-it.com/
--http://www.tradedoubler.fr/pan/public/solutions/publisher
--http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
-+http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
-+http://wanabo.com/
--http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
--http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
-+http://aeliosfinance.com/
-+http://www.centreincubation.com/
-+http://www.franceincubation.com/
--http://www.oseo.fr/
-+http://www.i18nfaq.com/chardet.html
--http://cpdetector.sourceforge.net/
-+http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
-+http://chezlorry.ca/Accueil.htm
-+http://cetnia.blogs.com/d_lires/
--http://www.directwine.fr/
-+http://www.new-phenix.com/
--http://upnp.sourceforge.net/
--http://www.pixmania.fr/
--http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
-+http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
-+http://www.stepnewz.com/sn/default.asp
-+http://opquast.com/
--http://www.freeplayer.org/
--http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
--http://atomcomputer.free.fr/fbox/
--http://www.internetactu.net/index.php?p=6100
--http://mammouthland.free.fr/cours/css/genecss.php
--http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
-+http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
--http://xml.apache.org/xalan-j/extensions.html
-+http://developers.sun.com/foryourbusiness/jcc/
-+http://blogs.sun.com/roller/page/roumen/Weblog
--http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
--http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
-+http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
-+http://odur.let.rug.nl/%7Evannoord/
--http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
--http://artist.inist.fr/
-+http://www.elra.info/
--http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
-+http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
-+http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
-+http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
-+http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
-+http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
-+http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
-+http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
-+http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
--http://www.lexique.org/
-+http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
-+http://www.streamium.com/products/mx6000i/
--http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
--http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
-+http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file


http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
deleted file mode 100644
index 705bdb2..0000000
--- a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
+++ /dev/null
@@ -1,27 +0,0 @@
-# The url filter file used by the crawl command.
-
-# Better for intranet crawling.
-# Be sure to change MY.DOMAIN.NAME to your domain name.
-
-# Each non-comment, non-blank line contains a regular expression
-# prefixed by '+' or '-'.  The first matching pattern in the file
-# determines whether a URL is included or ignored.  If no pattern
-# matches, the URL is ignored.
-
-# skip file:, ftp:, & mailto: urls
--^(file|ftp|mailto):
-
-# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
-
-# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
-
-# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
--.*(/.+?)/.*?\1/.*?\1/
-
-# accept hosts in MY.DOMAIN.NAME
-+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
-
-# skip everything else
--.

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
deleted file mode 100644
index b1ad9b7..0000000
--- a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
+++ /dev/null
@@ -1,8 +0,0 @@
--file://home/jc/nutch/index.html
--ftp://ftp.apache.org/nutch.html
--mailto:[email protected]
--news://any.news.server/comp.lang.java
--whois:/nutch.org
-+http://MY.DOMAIN.NAME/
-+http://MY.DOMAIN.NAME/nutch
-+http://www.MY.DOMAIN.NAME/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
deleted file mode 100644
index 8778921..0000000
--- a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
+++ /dev/null
@@ -1,22 +0,0 @@
-# The default url filter.
-# Better for whole-internet crawling.
-
-# Each non-comment, non-blank line contains a regular expression
-# prefixed by '+' or '-'.  The first matching pattern in the file
-# determines whether a URL is included or ignored.  If no pattern
-# matches, the URL is ignored.
-
-# skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
-
-# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
-
-# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
-
-# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
--.*(/.+?)/.*?\1/.*?\1/
-
-# accept anything else
-+.

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
deleted file mode 100644
index ccb6269..0000000
--- a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
+++ /dev/null
@@ -1,11 +0,0 @@
--file://home/jc/nutch/index.html
--ftp://ftp.apache.org/nutch.html
--mailto:[email protected]
-+news://any.news.server/comp.lang.java
-+whois:/nutch.org
--http://www.nutch.org/nutch.gif
--http://www.nutch.org/nutch.eps
--http://www.nutch.org/nutch?q=nutch
-+http://www.nutch.org/
--http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
--http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.rules b/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
deleted file mode 100644
index f7b0d13..0000000
--- a/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
+++ /dev/null
@@ -1,12 +0,0 @@
-# Skip all url's containing skip for example.org
-> www.example.org
--skip
-<
-
-# Allow all url's containing skip for example.com
-> www.example.com
-+skip
-<
-
-# Skip everything else
--.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.urls b/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
deleted file mode 100644
index c6f29d1..0000000
--- a/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
+++ /dev/null
@@ -1,3 +0,0 @@
--http://www.example.org/skip-me-now
-+http://www.example.com/noone-can-skip-me
--http://www.example.nl/i-am-filtered
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
new file mode 100644
index 0000000..c8901e2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# accept everything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
new file mode 100644
index 0000000..705bdb2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
@@ -0,0 +1,27 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
+
+# skip everything else
+-.

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls 
b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls
new file mode 100644
index 0000000..b1ad9b7
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jc@nutch.org
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules 
b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules
new file mode 100644
index 0000000..8778921
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules
@@ -0,0 +1,22 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept anything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls 
b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls
new file mode 100644
index 0000000..ccb6269
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jc@nutch.org
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules 
b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules
new file mode 100644
index 0000000..f7b0d13
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules
@@ -0,0 +1,12 @@
+# Skip all url's containing skip for example.org
+> www.example.org
+-skip
+<
+
+# Allow all url's containing skip for example.com
+> www.example.com
++skip
+<
+
+# Skip everything else
+-.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls 
b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls
new file mode 100644
index 0000000..c6f29d1
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls
@@ -0,0 +1,3 @@
+-http://www.example.org/skip-me-now
++http://www.example.com/noone-can-skip-me
+-http://www.example.nl/i-am-filtered
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-host/data/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/data/hosts.txt 
b/nutch-plugins/urlnormalizer-host/data/hosts.txt
deleted file mode 100644
index c7e0ccf..0000000
--- a/nutch-plugins/urlnormalizer-host/data/hosts.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# Force all sub domains to www.
-*.example.com example.com
-
-# Force no sub domain to www. URL's
-www.example.net example.net
-
-# Force www. sub domain when hitting link without sub domain
-example.org www.example.org
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt 
b/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt
new file mode 100644
index 0000000..c7e0ccf
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt
@@ -0,0 +1,8 @@
+# Force all sub domains to www.
+*.example.com example.com
+
+# Force no sub domain to www. URL's
+www.example.net example.net
+
+# Force www. sub domain when hitting link without sub domain
+example.org www.example.org
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt 
b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
deleted file mode 100644
index 7091cd7..0000000
--- a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# format: host\tprotocol\n
-
-example.org    http
-example.net    http
-
-example.io     https
-example.nl     https

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt 
b/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt
new file mode 100644
index 0000000..7091cd7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt
@@ -0,0 +1,7 @@
+# format: host\tprotocol\n
+
+example.org    http
+example.net    http
+
+example.io     https
+example.nl     https

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
deleted file mode 100644
index 7867ad8..0000000
--- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
+++ /dev/null
@@ -1,84 +0,0 @@
-# test simple removal of session id, keeping parameters before and after
-http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://foo.com/foo.php
-http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://foo.com/foo.php?f=2
-http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 
http://foo.com/foo.php?f=2&q=3
-http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 
http://foo.com/foo.php?f=2
-
-# test removal of different session ids including removal of ; in jsessionid
-http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl 
http://www.foo.com/foo.php
-http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y 
http://www.foo.com/foo.php?x=y
-http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED 
http://www.foo.com/foo.html
-http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED
 http://www.foo.com/foo.html?param=1&another=2
-http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2
 http://www.foo.com/foo.html?param=1&another=2
-http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 
http://www.foo.com/foo.php?x=1&something=1
-http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 
http://www.foo.com/foo.html
-http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo
 http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo
-http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43
 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en
-http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47
 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47
-# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328)
-http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
-http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
-
-# test removal default pages
-http://www.foo.com/home/index.html http://www.foo.com/home/
-http://www.foo.com/index.html http://www.foo.com/
-http://www.foo.com/index.htm http://www.foo.com/
-http://www.foo.com/index.asp http://www.foo.com/
-http://www.foo.com/index.aspx http://www.foo.com/
-http://www.foo.com/index.php http://www.foo.com/
-http://www.foo.com/index.php3 http://www.foo.com/
-http://www.foo.com/default.html http://www.foo.com/
-http://www.foo.com/default.htm http://www.foo.com/
-http://www.foo.com/default.asp http://www.foo.com/
-http://www.foo.com/default.aspx http://www.foo.com/
-http://www.foo.com/default.php http://www.foo.com/
-http://www.foo.com/default.php3 http://www.foo.com/
-http://www.foo.com/something.php3 http://www.foo.com/something.php3
-http://www.foo.com/something.html http://www.foo.com/something.html
-http://www.foo.com/something.asp http://www.foo.com/something.asp
-http://www.foo.com/index.phtml http://www.foo.com/
-http://www.foo.com/index.cfm http://www.foo.com/
-http://www.foo.com/index.cgi http://www.foo.com/
-http://www.foo.com/index.HTML http://www.foo.com/
-http://www.foo.com/index.Htm http://www.foo.com/
-http://www.foo.com/index.ASP http://www.foo.com/
-http://www.foo.com/index.jsp http://www.foo.com/
-http://www.foo.com/index.jsf http://www.foo.com/
-http://www.foo.com/index.jspx http://www.foo.com/
-http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx
-http://www.foo.com/index.jspa http://www.foo.com/
-http://www.foo.com/index.jsps http://www.foo.com/index.jsps
-http://www.foo.com/index.aspX http://www.foo.com/
-http://www.foo.com/index.PhP http://www.foo.com/
-http://www.foo.com/index.PhP4 http://www.foo.com/
-http://www.foo.com/default.HTml http://www.foo.com/
-http://www.foo.com/default.HTm http://www.foo.com/
-http://www.foo.com/default.ASp http://www.foo.com/
-http://www.foo.com/default.AspX http://www.foo.com/
-http://www.foo.com/default.PHP http://www.foo.com/
-http://www.foo.com/default.PHP3 http://www.foo.com/
-http://www.foo.com/index.phtml http://www.foo.com/
-http://www.foo.com/index.cfm http://www.foo.com/
-http://www.foo.com/index.cgi http://www.foo.com/
-
-# ensure keeping non-default pages
-http://www.foo.com/foo.php3 http://www.foo.com/foo.php3
-http://www.foo.com/foo.html http://www.foo.com/foo.html
-http://www.foo.com/foo.asp http://www.foo.com/foo.asp
-
-# test removal of interpage anchors and keeping query string
-http://www.foo.com/foo.html#something http://www.foo.com/foo.html
-http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y
-
-# test general cleaning of bad urls
-http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y
-http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
-http://www.foo.com/foo.html? http://www.foo.com/foo.html
-
-# remove double slashes but keep 2 slashes after protocol
-http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
-https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
-
-# normalize file: protocol prefix (keep one slash)
-file:///path//foo.html file:/path/foo.html
-file:/path//foo.html file:/path/foo.html

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
deleted file mode 100644
index 4d6eabc..0000000
--- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
+++ /dev/null
@@ -1,66 +0,0 @@
-<?xml version="1.0"?>
-<!-- This is the configuration file for the RegexUrlNormalize Class.
-     This is intended so that users can specify substitutions to be
-     done on URLs. The regex engine that is used is Perl5 compatible.
-     The rules are applied to URLs in the order they occur in this file.  -->
-
-<!-- WATCH OUT: an xml parser reads this file an ampersands must be
-     expanded to &amp; -->
-
-<!-- The following rules show how to strip out session IDs, default pages, 
-     interpage anchors, etc. Order does matter!  -->
-<regex-normalize>
-
-<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
-<regex>
-  
<pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
-  <substitution>$4</substitution>
-</regex>
-
-<!-- changes default pages into standard for /index.html, etc. into / -->
-<!-- these are commented in the default file but uncommented here for testing 
-->
-<regex>
-  
<pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern>
-  <substitution>/$3</substitution>
-</regex> 
-
-<!-- removes interpage href anchors such as site.com#location -->
-<regex>
-  <pattern>#.*?(\?|&amp;|$)</pattern>
-  <substitution>$1</substitution>
-</regex>
-
-<!-- cleans ?&var=value into ?var=value -->
-<regex>
-  <pattern>\?&amp;</pattern>
-  <substitution>\?</substitution>
-</regex>
-
-<!-- cleans multiple sequential ampersands into a single ampersand -->
-<regex>
-  <pattern>&amp;{2,}</pattern>
-  <substitution>&amp;</substitution>
-</regex>
-
-<!-- removes trailing ?, ampersands, . -->
-<regex>
-  <pattern>[\?&amp;\.]$</pattern>
-  <substitution></substitution>
-</regex>
-
-<!-- normalize file:/// protocol prefix: -->
-<!--  keep one single slash (NUTCH-1483) -->
-<regex>
-  <pattern>^file://+</pattern>
-  <substitution>file:/</substitution>
-</regex>
-
-<!-- removes duplicate slashes but -->
-<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
-<regex>
-  <pattern>(?&lt;!:)/{2,}</pattern>
-  <substitution>/</substitution>
-</regex>
-
-</regex-normalize>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
deleted file mode 100644
index 9d92880..0000000
--- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
+++ /dev/null
@@ -1,8 +0,0 @@
-# test removal of subdomains
-http://www.foo.bar.com/ http://bar.com/
-
-# test removal of url path
-http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://bar.com/
-
-# test removal of urls in arguments
-https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php 
https://bar.com/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
deleted file mode 100644
index 3698968..0000000
--- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-<?xml version="1.0"?>
-<!-- This is the configuration file for the RegexUrlNormalize Class.
-     This is intended so that users can specify substitutions to be
-     done on URLs. The regex engine that is used is Perl5 compatible.
-     The rules are applied to URLs in the order they occur in this file.  -->
-
-<!-- WATCH OUT: an xml parser reads this file an ampersands must be
-     expanded to &amp; -->
-
-<!--
-     The following rules show how to reduce urls so that
-     urls from the same domain are identical. This is useful
-     e.g. when calculating host counts, or splitting fetchlists.
--->
-<regex-normalize>
-<regex>
-  <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern>
-  <substitution>$1$3/</substitution>
-</regex>
-</regex-normalize>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test
 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test
new file mode 100644
index 0000000..7867ad8
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test
@@ -0,0 +1,84 @@
+# test simple removal of session id, keeping parameters before and after
+http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php
+http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2
+http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3
+http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2
+
+# test removal of different session ids including removal of ; in jsessionid
+http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php
+http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y
+http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html
+http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2
+http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2
+http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1
+http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html
+http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo
+http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en
+http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47
+# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
+http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
+
+# test removal default pages
+http://www.foo.com/home/index.html http://www.foo.com/home/
+http://www.foo.com/index.html http://www.foo.com/
+http://www.foo.com/index.htm http://www.foo.com/
+http://www.foo.com/index.asp http://www.foo.com/
+http://www.foo.com/index.aspx http://www.foo.com/
+http://www.foo.com/index.php http://www.foo.com/
+http://www.foo.com/index.php3 http://www.foo.com/
+http://www.foo.com/default.html http://www.foo.com/
+http://www.foo.com/default.htm http://www.foo.com/
+http://www.foo.com/default.asp http://www.foo.com/
+http://www.foo.com/default.aspx http://www.foo.com/
+http://www.foo.com/default.php http://www.foo.com/
+http://www.foo.com/default.php3 http://www.foo.com/
+http://www.foo.com/something.php3 http://www.foo.com/something.php3
+http://www.foo.com/something.html http://www.foo.com/something.html
+http://www.foo.com/something.asp http://www.foo.com/something.asp
+http://www.foo.com/index.phtml http://www.foo.com/
+http://www.foo.com/index.cfm http://www.foo.com/
+http://www.foo.com/index.cgi http://www.foo.com/
+http://www.foo.com/index.HTML http://www.foo.com/
+http://www.foo.com/index.Htm http://www.foo.com/
+http://www.foo.com/index.ASP http://www.foo.com/
+http://www.foo.com/index.jsp http://www.foo.com/
+http://www.foo.com/index.jsf http://www.foo.com/
+http://www.foo.com/index.jspx http://www.foo.com/
+http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx
+http://www.foo.com/index.jspa http://www.foo.com/
+http://www.foo.com/index.jsps http://www.foo.com/index.jsps
+http://www.foo.com/index.aspX http://www.foo.com/
+http://www.foo.com/index.PhP http://www.foo.com/
+http://www.foo.com/index.PhP4 http://www.foo.com/
+http://www.foo.com/default.HTml http://www.foo.com/
+http://www.foo.com/default.HTm http://www.foo.com/
+http://www.foo.com/default.ASp http://www.foo.com/
+http://www.foo.com/default.AspX http://www.foo.com/
+http://www.foo.com/default.PHP http://www.foo.com/
+http://www.foo.com/default.PHP3 http://www.foo.com/
+http://www.foo.com/index.phtml http://www.foo.com/
+http://www.foo.com/index.cfm http://www.foo.com/
+http://www.foo.com/index.cgi http://www.foo.com/
+
+# ensure keeping non-default pages
+http://www.foo.com/foo.php3 http://www.foo.com/foo.php3
+http://www.foo.com/foo.html http://www.foo.com/foo.html
+http://www.foo.com/foo.asp http://www.foo.com/foo.asp
+
+# test removal of interpage anchors and keeping query string
+http://www.foo.com/foo.html#something http://www.foo.com/foo.html
+http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y
+
+# test general cleaning of bad urls
+http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y
+http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
+http://www.foo.com/foo.html? http://www.foo.com/foo.html
+
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml
 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml
new file mode 100644
index 0000000..4d6eabc
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     escaped as &amp; -->
+
+<!-- The following rules show how to strip out session IDs, default pages, 
+     interpage anchors, etc. Order does matter!  -->
+<regex-normalize>
+
+<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
+<regex>
+  <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+  <substitution>$4</substitution>
+</regex>
+
+<!-- changes default pages into standard for /index.html, etc. into / -->
+<!-- these are commented in the default file but uncommented here for testing -->
+<regex>
+  <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern>
+  <substitution>/$3</substitution>
+</regex> 
+
+<!-- removes interpage href anchors such as site.com#location -->
+<regex>
+  <pattern>#.*?(\?|&amp;|$)</pattern>
+  <substitution>$1</substitution>
+</regex>
+
+<!-- cleans ?&var=value into ?var=value -->
+<regex>
+  <pattern>\?&amp;</pattern>
+  <substitution>\?</substitution>
+</regex>
+
+<!-- cleans multiple sequential ampersands into a single ampersand -->
+<regex>
+  <pattern>&amp;{2,}</pattern>
+  <substitution>&amp;</substitution>
+</regex>
+
+<!-- removes trailing ?, ampersands, . -->
+<regex>
+  <pattern>[\?&amp;\.]$</pattern>
+  <substitution></substitution>
+</regex>
+
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+  <pattern>(?&lt;!:)/{2,}</pattern>
+  <substitution>/</substitution>
+</regex>
+
+</regex-normalize>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test
 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test
new file mode 100644
index 0000000..9d92880
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test
@@ -0,0 +1,8 @@
+# test removal of subdomains
+http://www.foo.bar.com/ http://bar.com/
+
+# test removal of url path
+http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://bar.com/
+
+# test removal of urls in arguments
+https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php https://bar.com/

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml
 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml
new file mode 100644
index 0000000..3698968
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     escaped as &amp; -->
+
+<!--
+     The following rules show how to reduce urls so that
+     urls from the same domain are identical. This is useful
+     e.g. when calculating host counts, or splitting fetchlists.
+-->
+<regex-normalize>
+<regex>
+  <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern>
+  <substitution>$1$3/</substitution>
+</regex>
+</regex-normalize>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-slash/data/slashes.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/data/slashes.txt 
b/nutch-plugins/urlnormalizer-slash/data/slashes.txt
deleted file mode 100644
index d3bd70a..0000000
--- a/nutch-plugins/urlnormalizer-slash/data/slashes.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# Both domains have duplicate URL's, some with slashes and some without
-
-# We prefer this domain with slashes
-www.example.org +
-
-# ..but this domain without
-www.example.net -
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt 
b/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt
new file mode 100644
index 0000000..d3bd70a
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt
@@ -0,0 +1,7 @@
+# Both domains have duplicate URL's, some with slashes and some without
+
+# We prefer this domain with slashes
+www.example.org +
+
+# ..but this domain without
+www.example.net -
\ No newline at end of file

[67/69] [abbrv] nutch git commit: Moved test resources to maven's test resources directory

Reply via email to