http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls b/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls deleted file mode 100644 index 40bf4ee..0000000 --- a/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls +++ /dev/null @@ -1,297 +0,0 @@ -+http://www.hostip.info/ --http://www.elanceur.org/Articles/OntologieSurfaite.html -+http://www.opensymphony.com/quartz/ --http://www.portletbridge.org/saxbenchmark/index.html -+http://www.lesmotsdelinfo.com/ -+http://usefulinc.com/doap/ -+http://www.codezoo.com/ -+http://search.infocious.com/ --http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html -+http://www.brics.dk/%7Eamoeller/automaton/ -+http://jazzz.com/wp.html -+http://www.maxkiesler.com/index.php -+http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html -+http://www.alias-i.com/lingpipe/ --http://johnny.ihackstuff.com/index.php?module=prodreviews --http://www.spurl.net/ -+http://www.dropload.com/ -+http://vivisimo.com/ -+http://www.marumushi.com/apps/newsmap/newsmap.cfm -+http://www.ixquick.com/ --http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html -+http://www.mail-archive.com/ -+http://www.spymac.com/ --http://browsers.evolt.org/ --http://www.oswd.org/ -+http://www.stayinvisible.com/index.pl -+http://java.sun.com/j2se/1.4.2/docs/api/index.html -+http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx -+http://www.bloglines.com/ --http://www.fckeditor.net/ -+http://search.msn.com/ --http://www.grub.org/ -+http://www.xml.com/pub/a/2000/11/29/schemas/part1.html --http://www.mnot.net/cache_docs/ --http://www.furl.net/ -+http://www.blogpulse.com/ -+http://www.googlefight.com/ -+http://www.rokulabs.com/ --http://mightylegends.zapto.org/dvd/dvdauthor_howto.php 
--http://www.batbox.org/wrt54g-linux.html --http://en.wikipedia.org/wiki/%s -+http://www.sipcenter.com/ -+http://www.merriampark.com/ld.htm -+http://anon.inf.tu-dresden.de/index_en.html -+http://www.pluck.com/ -+http://www.tiddlywiki.com/ -+http://www.jux2.com/ -+http://clusty.com/ --http://findability.org/ -+http://www.searchengineshowdown.com/ -+http://www.nhacks.com/email/index.php -+http://www.koders.com/ -+http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf -+http://www.gmailwiki.com/index.php/Main_Page -+http://www.tadalist.com/ -+http://www.net2ftp.com/ -+http://www.streamload.com/ -+http://www.lucazappa.com/brilliantMaker/buttonImage.php -+http://www.hybernaut.com/bdv/delicious-import.html -+http://www.gtmcknight.com/buttons/ -+http://amb.vis.ne.jp/mozilla/scrapbook/ -+http://g-metrics.com/index.php --http://tor.eff.org/ -+http://www.search-this.com/search_engine_decoder.asp -+http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html -+http://www.adaptivepath.com/publications/essays/archives/000385.php --http://isnoop.net/gmail/ --http://openweb.eu.org/ -+http://www.mistergooddeal.com/ -+http://javatoolbox.com/ --http://www.freenews.fr/ -+http://www.wikiwax.com/ --http://today.java.net/pub/a/today/2005/04/21/farm.html -+http://users.skynet.be/J.Beever/pave.htm -+http://www.lundi8h.com/ -+http://www.snap.com/ -+http://www.goosee.com/puppy/index.shtml --http://www.softwarefreedom.org/index.html --http://y.20q.net/ -+http://www.bitty.com/ -+http://www.lafraise.com/ --http://www.liquidinformation.org/ -+http://www.searchtools.com/ -+http://www.martinfowler.com/articles/injection.html -+http://pdos.csail.mit.edu/scigen/ --http://developer.yahoo.net/blog/ -+http://blogger-templates.blogspot.com/ -+http://phpadsnew.com/two/ -+http://www.langreiter.com/exec/yahoo-vs-google.html --http://www.dataparksearch.org/ --http://www.yubnub.org/ --http://www.fing.org/ --http://www.swish-e.org/ --http://www.openajax.net/wordpress/ 
-+http://crypto.stanford.edu/PwdHash/ -+http://www.html-kit.com/favicon/ --http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 -+http://www.durhamtownship.com/ -+http://jiwire.com/ -+http://www.insilmaril.de/vym/ --http://www.spreadshirt.net/ -+http://www.goffice.com/ -+http://www.writely.com/ -+http://www.milindparikh.com/ -+http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html -+http://www.wikyblog.com/Map/Guest/Home --http://www.kottke.org/05/08/googleos-webos -+http://www.rollyo.com/ -+http://www.meebo.com/ -+http://www.factbites.com/ -+http://www.placeopedia.com/ -+http://swoogle.umbc.edu/ -+http://www.viaduc.com/ --http://demo.wikiwyg.net/wikiwyg/demo/standalone/ -+http://podcasts.yahoo.com/ --http://beaglewiki.org/Main_Page -+http://yq.search.yahoo.com/ --http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 -+http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html -+http://socialight.com/ -+http://www.lexxe.com/ -+http://www.xom.nu/ -+http://www.turboprint.de/ -+http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 -+http://www.wi-fiplanet.com/tutorials/article.php/3562391 -+http://particletree.com/features/10-tips-to-a-better-form/ -+http://www.songbirdnest.com/ --http://www.w3.org/Talks/Tools/Slidy/ --http://www.compassframework.org/display/SITE/Home -+http://motrech.blogspot.com/ -+http://www.moteurzine.com/ -+http://www.mex-search.com/ --http://beta.previewseek.com/?mdc=y&twin=n&ilang=french -+http://www.goshme.com/ -+http://rialto.application-servers.com/ -+http://www.multe-pass.com/ -+http://www.tailrank.com/ -+http://www.vandertramp.com/INTERNETDOWN/ -+http://www.letterjames.de/index.html -+http://code.google.com/index.html -+http://www.kritx.com/ -+http://performancing.com/firefox -+http://www.mywebsearch.com/ --http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 -+http://www.lukew.com/resources/articles/blogs2.asp --http://www.hyperwords.net/ 
-+http://ajax.parish.ath.cx/translator/ -+http://www.maplandia.com/ --http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages -+http://onefeed.com/index.php -+http://www.file-swap.com/ --http://opennlp.org/ -+http://mindprod.com/jgloss/encoding.html -+http://code.google.com/webstats/index.html -+http://www.freeweb-hosting.com/google_pagerank_pr_checker/ --http://www.framakey.org/ --http://microformats.org/wiki/hreview --http://www.ashesandsnow.org/index2.html --http://uima-framework.sourceforge.net/ -+http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html --http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 -+http://fr.techcrunch.com/ --http://developer.yahoo.net/yui/ -+http://www.fredrikodman.com/ -+http://www.mpirical.com/companion/mpirical_companion.html -+http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html --http://k9copy.free.fr/ --http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 --http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design --http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 -+http://blogokat.canalblog.com/archives/2005/11/02/882454.html -+http://robur.slu.se/jensl/xmlclitools/ --http://www.internetactu.net/?p=6291 --http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 -+http://www.memodata.com/2004/fr/alexandria/ --http://presse-citron.net/?2006/01/23/654-joomla-pete-grave -+http://www.randomerror.com/ -+http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ --http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 --http://interstices.info/display.jsp?id=c_15918 -+http://www.tech-invite.com/ -+http://www.croczilla.com/zap --http://www.libervis.com/modules/wordpress/?p=13 -+http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ --http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm -+http://www.influo.com/ 
-+http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html --http://www.addnb.org/fr/docs/webinvisible.htm --http://manhack.net/ --http://www.jibaku.net/ -+http://www.pipologie.com/ -+http://christophenoel.blogspot.com/ --http://www.seekport.fr/seekbot/ -+http://beta.exalead.com/ --http://www.boolgum.fr/index.html -+http://www.kesako.canalblog.com/ -+http://loran.blogspot.com/ -+http://outils-recherche.blogspot.com/ -+http://www.art-dept.com/artists/giacobbe/ -+http://www.meggould.netfirms.com/site_seeingIII.htm -+http://www.freedpi.com/ -+http://www.frenchfred.com/ -+http://www.photoways.com/ --http://freco.free.fr/index.htm --http://triturages.free.fr/index.htm --http://www.qsos.org/ -+http://www.alvis.info/alvis/ -+http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ --http://www.shinux.org/ -+http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml -+http://www.kurobox.com/online/tiki-index.php --http://news.gmane.org/gmane.comp.misc.linkstation.linux -+http://www.imsbook.com/SIP-IMS-Standards-List.html --http://incubator.apache.org/directory/subprojects/snickers/ --http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html --http://sourceforge.net/projects/cryptix-asn1/ --http://sourceforge.net/projects/basn/ --http://asn1.elibel.tm.fr/fr/index.htm --http://sourceforge.net/projects/a2j/ -+http://www.degrouptest.com/ -+http://interstices.info/ -+http://louvre-boite.viabloga.com/news/18.shtml --http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html -+http://poiplace.oabsoftware.nl/ --http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 --http://www.yoono.com/favorites.jsp?user-id=lquerel --http://www.librecours.org/cgi-bin/main --http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 --http://limo.sourceforge.net/ -+http://www-scf.usc.edu/%7Emattmann/ -+http://spaces.msn.com/members/famillezen/ 
--http://photos.joune.org/ --http://www.canon.fr/paperart/ -+http://flash.eastweb.ru/files/20051024092150.swf -+http://www.xsltwiki.com/index.php/Main_Page -+http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ --http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 -+http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html --http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ -+http://www.aeliosfinance.com/ -+http://www.capital-it.com/ --http://www.tradedoubler.fr/pan/public/solutions/publisher --http://www.recherche.gouv.fr/technologie/concours/2006/index.htm -+http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ -+http://wanabo.com/ --http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 --http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam -+http://aeliosfinance.com/ -+http://www.centreincubation.com/ -+http://www.franceincubation.com/ --http://www.oseo.fr/ -+http://www.i18nfaq.com/chardet.html --http://cpdetector.sourceforge.net/ -+http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles -+http://chezlorry.ca/Accueil.htm -+http://cetnia.blogs.com/d_lires/ --http://www.directwine.fr/ -+http://www.new-phenix.com/ --http://upnp.sourceforge.net/ --http://www.pixmania.fr/ --http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 -+http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ -+http://www.stepnewz.com/sn/default.asp -+http://opquast.com/ --http://www.freeplayer.org/ --http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie --http://atomcomputer.free.fr/fbox/ --http://www.internetactu.net/index.php?p=6100 --http://mammouthland.free.fr/cours/css/genecss.php --http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 
-+http://www-106.ibm.com/developerworks/xml/library/x-xapi.html --http://xml.apache.org/xalan-j/extensions.html -+http://developers.sun.com/foryourbusiness/jcc/ -+http://blogs.sun.com/roller/page/roumen/Weblog --http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 --http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 -+http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ -+http://odur.let.rug.nl/%7Evannoord/ --http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html --http://artist.inist.fr/ -+http://www.elra.info/ --http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO -+http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability -+http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval -+http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ -+http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ -+http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ -+http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ -+http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ -+http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html --http://www.lexique.org/ -+http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ -+http://www.streamium.com/products/mx6000i/ --http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr --http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 -+http://www.tversity.com/ --http://www.aspseek.org/index.php \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules deleted file mode 100644 index 705bdb2..0000000 --- a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules +++ /dev/null @@ -1,27 +0,0 @@ -# The url filter file used by the crawl command. - -# Better for intranet crawling. -# Be sure to change MY.DOMAIN.NAME to your domain name. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): - -# skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ - -# skip URLs containing certain characters as probable queries, etc. --[?*!@=] - -# skip URLs with slash-delimited segment that repeats 3+ times, to break loops --.*(/.+?)/.*?\1/.*?\1/ - -# accept hosts in MY.DOMAIN.NAME -+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ - -# skip everything else --. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls deleted file mode 100644 index b1ad9b7..0000000 --- a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls +++ /dev/null @@ -1,8 +0,0 @@ --file://home/jc/nutch/index.html --ftp://ftp.apache.org/nutch.html --mailto:[email protected] --news://any.news.server/comp.lang.java --whois:/nutch.org -+http://MY.DOMAIN.NAME/ -+http://MY.DOMAIN.NAME/nutch -+http://www.MY.DOMAIN.NAME/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules deleted file mode 100644 index 8778921..0000000 --- a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules +++ /dev/null @@ -1,22 +0,0 @@ -# The default url filter. -# Better for whole-internet crawling. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file: ftp: and mailto: urls --^(file|ftp|mailto): - -# skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$ - -# skip URLs containing certain characters as probable queries, etc. --[?*!@=] - -# skip URLs with slash-delimited segment that repeats 3+ times, to break loops --.*(/.+?)/.*?\1/.*?\1/ - -# accept anything else -+. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls deleted file mode 100644 index ccb6269..0000000 --- a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls +++ /dev/null @@ -1,11 +0,0 @@ --file://home/jc/nutch/index.html --ftp://ftp.apache.org/nutch.html --mailto:[email protected] -+news://any.news.server/comp.lang.java -+whois:/nutch.org --http://www.nutch.org/nutch.gif --http://www.nutch.org/nutch.eps --http://www.nutch.org/nutch?q=nutch -+http://www.nutch.org/ --http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ --http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/nutch1838.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.rules b/nutch-plugins/urlfilter-regex/sample/nutch1838.rules deleted file mode 100644 index f7b0d13..0000000 --- a/nutch-plugins/urlfilter-regex/sample/nutch1838.rules +++ /dev/null @@ -1,12 +0,0 @@ -# Skip all url's containing skip for example.org -> www.example.org --skip -< - -# Allow all url's containing skip for example.com -> www.example.com -+skip -< - -# Skip everything else --. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/nutch1838.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.urls b/nutch-plugins/urlfilter-regex/sample/nutch1838.urls deleted file mode 100644 index c6f29d1..0000000 --- a/nutch-plugins/urlfilter-regex/sample/nutch1838.urls +++ /dev/null @@ -1,3 +0,0 @@ --http://www.example.org/skip-me-now -+http://www.example.com/noone-can-skip-me --http://www.example.nl/i-am-filtered \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules new file mode 100644 index 0000000..c8901e2 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip .fr .org and .net domains +-^.*//.*\.fr/ +-^.*//.*\.org/ +-^.*//.*\.net/ + +# accept everything else ++. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls new file mode 100644 index 0000000..40bf4ee --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http://johnny.ihackstuff.com/index.php?module=prodreviews +-http://www.spurl.net/ ++http://www.dropload.com/ ++http://vivisimo.com/ ++http://www.marumushi.com/apps/newsmap/newsmap.cfm ++http://www.ixquick.com/ +-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html ++http://www.mail-archive.com/ ++http://www.spymac.com/ +-http://browsers.evolt.org/ +-http://www.oswd.org/ ++http://www.stayinvisible.com/index.pl ++http://java.sun.com/j2se/1.4.2/docs/api/index.html ++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx ++http://www.bloglines.com/ +-http://www.fckeditor.net/ ++http://search.msn.com/ +-http://www.grub.org/ ++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html +-http://www.mnot.net/cache_docs/ +-http://www.furl.net/ ++http://www.blogpulse.com/ ++http://www.googlefight.com/ ++http://www.rokulabs.com/ 
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php +-http://www.batbox.org/wrt54g-linux.html +-http://en.wikipedia.org/wiki/%s ++http://www.sipcenter.com/ ++http://www.merriampark.com/ld.htm ++http://anon.inf.tu-dresden.de/index_en.html ++http://www.pluck.com/ ++http://www.tiddlywiki.com/ ++http://www.jux2.com/ ++http://clusty.com/ +-http://findability.org/ ++http://www.searchengineshowdown.com/ ++http://www.nhacks.com/email/index.php ++http://www.koders.com/ ++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf ++http://www.gmailwiki.com/index.php/Main_Page ++http://www.tadalist.com/ ++http://www.net2ftp.com/ ++http://www.streamload.com/ ++http://www.lucazappa.com/brilliantMaker/buttonImage.php ++http://www.hybernaut.com/bdv/delicious-import.html ++http://www.gtmcknight.com/buttons/ ++http://amb.vis.ne.jp/mozilla/scrapbook/ ++http://g-metrics.com/index.php +-http://tor.eff.org/ ++http://www.search-this.com/search_engine_decoder.asp ++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html ++http://www.adaptivepath.com/publications/essays/archives/000385.php +-http://isnoop.net/gmail/ +-http://openweb.eu.org/ ++http://www.mistergooddeal.com/ ++http://javatoolbox.com/ +-http://www.freenews.fr/ ++http://www.wikiwax.com/ +-http://today.java.net/pub/a/today/2005/04/21/farm.html ++http://users.skynet.be/J.Beever/pave.htm ++http://www.lundi8h.com/ ++http://www.snap.com/ ++http://www.goosee.com/puppy/index.shtml +-http://www.softwarefreedom.org/index.html +-http://y.20q.net/ ++http://www.bitty.com/ ++http://www.lafraise.com/ +-http://www.liquidinformation.org/ ++http://www.searchtools.com/ ++http://www.martinfowler.com/articles/injection.html ++http://pdos.csail.mit.edu/scigen/ +-http://developer.yahoo.net/blog/ ++http://blogger-templates.blogspot.com/ ++http://phpadsnew.com/two/ ++http://www.langreiter.com/exec/yahoo-vs-google.html +-http://www.dataparksearch.org/ +-http://www.yubnub.org/ +-http://www.fing.org/ +-http://www.swish-e.org/ 
+-http://www.openajax.net/wordpress/ ++http://crypto.stanford.edu/PwdHash/ ++http://www.html-kit.com/favicon/ +-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 ++http://www.durhamtownship.com/ ++http://jiwire.com/ ++http://www.insilmaril.de/vym/ +-http://www.spreadshirt.net/ ++http://www.goffice.com/ ++http://www.writely.com/ ++http://www.milindparikh.com/ ++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html ++http://www.wikyblog.com/Map/Guest/Home +-http://www.kottke.org/05/08/googleos-webos ++http://www.rollyo.com/ ++http://www.meebo.com/ ++http://www.factbites.com/ ++http://www.placeopedia.com/ ++http://swoogle.umbc.edu/ ++http://www.viaduc.com/ +-http://demo.wikiwyg.net/wikiwyg/demo/standalone/ ++http://podcasts.yahoo.com/ +-http://beaglewiki.org/Main_Page ++http://yq.search.yahoo.com/ +-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 ++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html ++http://socialight.com/ ++http://www.lexxe.com/ ++http://www.xom.nu/ ++http://www.turboprint.de/ ++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 ++http://www.wi-fiplanet.com/tutorials/article.php/3562391 ++http://particletree.com/features/10-tips-to-a-better-form/ ++http://www.songbirdnest.com/ +-http://www.w3.org/Talks/Tools/Slidy/ +-http://www.compassframework.org/display/SITE/Home ++http://motrech.blogspot.com/ ++http://www.moteurzine.com/ ++http://www.mex-search.com/ +-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french ++http://www.goshme.com/ ++http://rialto.application-servers.com/ ++http://www.multe-pass.com/ ++http://www.tailrank.com/ ++http://www.vandertramp.com/INTERNETDOWN/ ++http://www.letterjames.de/index.html ++http://code.google.com/index.html ++http://www.kritx.com/ ++http://performancing.com/firefox ++http://www.mywebsearch.com/ +-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 ++http://www.lukew.com/resources/articles/blogs2.asp 
+-http://www.hyperwords.net/ ++http://ajax.parish.ath.cx/translator/ ++http://www.maplandia.com/ +-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages ++http://onefeed.com/index.php ++http://www.file-swap.com/ +-http://opennlp.org/ ++http://mindprod.com/jgloss/encoding.html ++http://code.google.com/webstats/index.html ++http://www.freeweb-hosting.com/google_pagerank_pr_checker/ +-http://www.framakey.org/ +-http://microformats.org/wiki/hreview +-http://www.ashesandsnow.org/index2.html +-http://uima-framework.sourceforge.net/ ++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html +-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 ++http://fr.techcrunch.com/ +-http://developer.yahoo.net/yui/ ++http://www.fredrikodman.com/ ++http://www.mpirical.com/companion/mpirical_companion.html ++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html +-http://k9copy.free.fr/ +-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 +-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design +-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 ++http://blogokat.canalblog.com/archives/2005/11/02/882454.html ++http://robur.slu.se/jensl/xmlclitools/ +-http://www.internetactu.net/?p=6291 +-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 ++http://www.memodata.com/2004/fr/alexandria/ +-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave ++http://www.randomerror.com/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ +-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 +-http://interstices.info/display.jsp?id=c_15918 ++http://www.tech-invite.com/ ++http://www.croczilla.com/zap +-http://www.libervis.com/modules/wordpress/?p=13 ++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ +-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm 
++http://www.influo.com/ ++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html +-http://www.addnb.org/fr/docs/webinvisible.htm +-http://manhack.net/ +-http://www.jibaku.net/ ++http://www.pipologie.com/ ++http://christophenoel.blogspot.com/ +-http://www.seekport.fr/seekbot/ ++http://beta.exalead.com/ +-http://www.boolgum.fr/index.html ++http://www.kesako.canalblog.com/ ++http://loran.blogspot.com/ ++http://outils-recherche.blogspot.com/ ++http://www.art-dept.com/artists/giacobbe/ ++http://www.meggould.netfirms.com/site_seeingIII.htm ++http://www.freedpi.com/ ++http://www.frenchfred.com/ ++http://www.photoways.com/ +-http://freco.free.fr/index.htm +-http://triturages.free.fr/index.htm +-http://www.qsos.org/ ++http://www.alvis.info/alvis/ ++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ +-http://www.shinux.org/ ++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml ++http://www.kurobox.com/online/tiki-index.php +-http://news.gmane.org/gmane.comp.misc.linkstation.linux ++http://www.imsbook.com/SIP-IMS-Standards-List.html +-http://incubator.apache.org/directory/subprojects/snickers/ +-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html +-http://sourceforge.net/projects/cryptix-asn1/ +-http://sourceforge.net/projects/basn/ +-http://asn1.elibel.tm.fr/fr/index.htm +-http://sourceforge.net/projects/a2j/ ++http://www.degrouptest.com/ ++http://interstices.info/ ++http://louvre-boite.viabloga.com/news/18.shtml +-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html ++http://poiplace.oabsoftware.nl/ +-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 +-http://www.yoono.com/favorites.jsp?user-id=lquerel +-http://www.librecours.org/cgi-bin/main +-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 +-http://limo.sourceforge.net/ ++http://www-scf.usc.edu/%7Emattmann/ 
++http://spaces.msn.com/members/famillezen/ +-http://photos.joune.org/ +-http://www.canon.fr/paperart/ ++http://flash.eastweb.ru/files/20051024092150.swf ++http://www.xsltwiki.com/index.php/Main_Page ++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ +-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 ++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html +-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ ++http://www.aeliosfinance.com/ ++http://www.capital-it.com/ +-http://www.tradedoubler.fr/pan/public/solutions/publisher +-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm ++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ ++http://wanabo.com/ +-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 +-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam ++http://aeliosfinance.com/ ++http://www.centreincubation.com/ ++http://www.franceincubation.com/ +-http://www.oseo.fr/ ++http://www.i18nfaq.com/chardet.html +-http://cpdetector.sourceforge.net/ ++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles ++http://chezlorry.ca/Accueil.htm ++http://cetnia.blogs.com/d_lires/ +-http://www.directwine.fr/ ++http://www.new-phenix.com/ +-http://upnp.sourceforge.net/ +-http://www.pixmania.fr/ +-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 ++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ ++http://www.stepnewz.com/sn/default.asp ++http://opquast.com/ +-http://www.freeplayer.org/ +-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie +-http://atomcomputer.free.fr/fbox/ +-http://www.internetactu.net/index.php?p=6100 +-http://mammouthland.free.fr/cours/css/genecss.php 
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 ++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html +-http://xml.apache.org/xalan-j/extensions.html ++http://developers.sun.com/foryourbusiness/jcc/ ++http://blogs.sun.com/roller/page/roumen/Weblog +-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 +-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 ++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ ++http://odur.let.rug.nl/%7Evannoord/ +-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html +-http://artist.inist.fr/ ++http://www.elra.info/ +-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO ++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability ++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval ++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ ++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ ++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ ++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ ++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html +-http://www.lexique.org/ ++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ ++http://www.streamium.com/products/mx6000i/ +-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr +-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 ++http://www.tversity.com/ +-http://www.aspseek.org/index.php \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules new file mode 100644 index 0000000..705bdb2 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules @@ -0,0 +1,27 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + +# accept hosts in MY.DOMAIN.NAME ++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ + +# skip everything else +-. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls new file mode 100644 index 0000000..b1ad9b7 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.urls @@ -0,0 +1,8 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] +-news://any.news.server/comp.lang.java +-whois:/nutch.org ++http://MY.DOMAIN.NAME/ ++http://MY.DOMAIN.NAME/nutch ++http://www.MY.DOMAIN.NAME/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules new file mode 100644 index 0000000..8778921 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.rules @@ -0,0 +1,22 @@ +# The default url filter. +# Better for whole-internet crawling. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file: ftp: and mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + +# accept anything else ++. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls new file mode 100644 index 0000000..ccb6269 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/WholeWebCrawling.urls @@ -0,0 +1,11 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] ++news://any.news.server/comp.lang.java ++whois:/nutch.org +-http://www.nutch.org/nutch.gif +-http://www.nutch.org/nutch.eps +-http://www.nutch.org/nutch?q=nutch ++http://www.nutch.org/ +-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ +-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules new file mode 100644 index 0000000..f7b0d13 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.rules @@ -0,0 +1,12 @@ +# Skip all url's containing skip for example.org +> www.example.org +-skip +< + +# Allow all url's containing skip for example.com +> www.example.com ++skip +< + +# Skip everything else +-. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls new file mode 100644 index 0000000..c6f29d1 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/nutch1838.urls @@ -0,0 +1,3 @@ +-http://www.example.org/skip-me-now ++http://www.example.com/noone-can-skip-me +-http://www.example.nl/i-am-filtered \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-host/data/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/data/hosts.txt b/nutch-plugins/urlnormalizer-host/data/hosts.txt deleted file mode 100644 index c7e0ccf..0000000 --- a/nutch-plugins/urlnormalizer-host/data/hosts.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Force all sub domains to www. -*.example.com example.com - -# Force no sub domain to www. URL's -www.example.net example.net - -# Force www. sub domain when hitting link without sub domain -example.org www.example.org \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt b/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt new file mode 100644 index 0000000..c7e0ccf --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/src/test/resources/hosts.txt @@ -0,0 +1,8 @@ +# Force all sub domains to www. +*.example.com example.com + +# Force no sub domain to www. URL's +www.example.net example.net + +# Force www. 
sub domain when hitting link without sub domain +example.org www.example.org \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-protocol/data/protocols.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt deleted file mode 100644 index 7091cd7..0000000 --- a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt +++ /dev/null @@ -1,7 +0,0 @@ -# format: host\tprotocol\n - -example.org http -example.net http - -example.io https -example.nl https http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt b/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt new file mode 100644 index 0000000..7091cd7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/src/test/resources/protocols.txt @@ -0,0 +1,7 @@ +# format: host\tprotocol\n + +example.org http +example.net http + +example.io https +example.nl https http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test deleted file mode 100644 index 7867ad8..0000000 --- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test +++ /dev/null @@ -1,84 +0,0 @@ -# test simple removal of session id, keeping parameters before and after -http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php 
-http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 -http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3 -http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2 - -# test removal of different session ids including removal of ; in jsessionid -http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php -http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y -http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html -http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 -http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 -http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 -http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html -http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo -http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en -http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47 -# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328) -http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 -http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 - -# test removal default pages 
-http://www.foo.com/home/index.html http://www.foo.com/home/ -http://www.foo.com/index.html http://www.foo.com/ -http://www.foo.com/index.htm http://www.foo.com/ -http://www.foo.com/index.asp http://www.foo.com/ -http://www.foo.com/index.aspx http://www.foo.com/ -http://www.foo.com/index.php http://www.foo.com/ -http://www.foo.com/index.php3 http://www.foo.com/ -http://www.foo.com/default.html http://www.foo.com/ -http://www.foo.com/default.htm http://www.foo.com/ -http://www.foo.com/default.asp http://www.foo.com/ -http://www.foo.com/default.aspx http://www.foo.com/ -http://www.foo.com/default.php http://www.foo.com/ -http://www.foo.com/default.php3 http://www.foo.com/ -http://www.foo.com/something.php3 http://www.foo.com/something.php3 -http://www.foo.com/something.html http://www.foo.com/something.html -http://www.foo.com/something.asp http://www.foo.com/something.asp -http://www.foo.com/index.phtml http://www.foo.com/ -http://www.foo.com/index.cfm http://www.foo.com/ -http://www.foo.com/index.cgi http://www.foo.com/ -http://www.foo.com/index.HTML http://www.foo.com/ -http://www.foo.com/index.Htm http://www.foo.com/ -http://www.foo.com/index.ASP http://www.foo.com/ -http://www.foo.com/index.jsp http://www.foo.com/ -http://www.foo.com/index.jsf http://www.foo.com/ -http://www.foo.com/index.jspx http://www.foo.com/ -http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx -http://www.foo.com/index.jspa http://www.foo.com/ -http://www.foo.com/index.jsps http://www.foo.com/index.jsps -http://www.foo.com/index.aspX http://www.foo.com/ -http://www.foo.com/index.PhP http://www.foo.com/ -http://www.foo.com/index.PhP4 http://www.foo.com/ -http://www.foo.com/default.HTml http://www.foo.com/ -http://www.foo.com/default.HTm http://www.foo.com/ -http://www.foo.com/default.ASp http://www.foo.com/ -http://www.foo.com/default.AspX http://www.foo.com/ -http://www.foo.com/default.PHP http://www.foo.com/ -http://www.foo.com/default.PHP3 http://www.foo.com/ 
-http://www.foo.com/index.phtml http://www.foo.com/ -http://www.foo.com/index.cfm http://www.foo.com/ -http://www.foo.com/index.cgi http://www.foo.com/ - -# ensure keeping non-default pages -http://www.foo.com/foo.php3 http://www.foo.com/foo.php3 -http://www.foo.com/foo.html http://www.foo.com/foo.html -http://www.foo.com/foo.asp http://www.foo.com/foo.asp - -# test removal of interpage anchors and keeping query string -http://www.foo.com/foo.html#something http://www.foo.com/foo.html -http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y - -# test general cleaning of bad urls -http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y -http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a -http://www.foo.com/foo.html? http://www.foo.com/foo.html - -# remove double slashes but keep 2 slashes after protocol -http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html -https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html - -# normalize file: protocol prefix (keep one slash) -file:///path//foo.html file:/path/foo.html -file:/path//foo.html file:/path/foo.html http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml deleted file mode 100644 index 4d6eabc..0000000 --- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml +++ /dev/null @@ -1,66 +0,0 @@ -<?xml version="1.0"?> -<!-- This is the configuration file for the RegexUrlNormalize Class. - This is intended so that users can specify substitutions to be - done on URLs. The regex engine that is used is Perl5 compatible. - The rules are applied to URLs in the order they occur in this file. 
--> - -<!-- WATCH OUT: an xml parser reads this file an ampersands must be - expanded to &amp; --> - -<!-- The following rules show how to strip out session IDs, default pages, - interpage anchors, etc. Order does matter! --> -<regex-normalize> - -<!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> -<regex> - <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> - <substitution>$4</substitution> -</regex> - -<!-- changes default pages into standard for /index.html, etc. into / --> -<!-- these are commented in the default file but uncommented here for testing --> -<regex> - <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern> - <substitution>/$3</substitution> -</regex> - -<!-- removes interpage href anchors such as site.com#location --> -<regex> - <pattern>#.*?(\?|&amp;|$)</pattern> - <substitution>$1</substitution> -</regex> - -<!-- cleans ?&var=value into ?var=value --> -<regex> - <pattern>\?&amp;</pattern> - <substitution>\?</substitution> -</regex> - -<!-- cleans multiple sequential ampersands into a single ampersand --> -<regex> - <pattern>&amp;{2,}</pattern> - <substitution>&amp;</substitution> -</regex> - -<!-- removes trailing ?, ampersands, . 
--> -<regex> - <pattern>[\?&amp;\.]$</pattern> - <substitution></substitution> -</regex> - -<!-- normalize file:/// protocol prefix: --> -<!-- keep one single slash (NUTCH-1483) --> -<regex> - <pattern>^file://+</pattern> - <substitution>file:/</substitution> -</regex> - -<!-- removes duplicate slashes but --> -<!-- * allow 2 slashes after colon ':' (indicating protocol) --> -<regex> - <pattern>(?&lt;!:)/{2,}</pattern> - <substitution>/</substitution> -</regex> - -</regex-normalize> - http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test deleted file mode 100644 index 9d92880..0000000 --- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test +++ /dev/null @@ -1,8 +0,0 @@ -# test removal of subdomains -http://www.foo.bar.com/ http://bar.com/ - -# test removal of url path -http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://bar.com/ - -# test removal of urls in arguments -https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php https://bar.com/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml deleted file mode 100644 index 3698968..0000000 --- a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml +++ /dev/null @@ -1,21 +0,0 @@ -<?xml version="1.0"?> -<!-- This is the configuration file for the RegexUrlNormalize Class. - This is intended so that users can specify substitutions to be - done on URLs. 
The regex engine that is used is Perl5 compatible. - The rules are applied to URLs in the order they occur in this file. --> - -<!-- WATCH OUT: an xml parser reads this file an ampersands must be - expanded to & --> - -<!-- - The following rules show how to reduce urls so that - urls from the same domain are identical. This is useful - e.g. when calculating host counts, or splitting fetchlists. ---> -<regex-normalize> -<regex> - <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern> - <substitution>$1$3/</substitution> -</regex> -</regex-normalize> - http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test new file mode 100644 index 0000000..7867ad8 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.test @@ -0,0 +1,84 @@ +# test simple removal of session id, keeping parameters before and after +http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3 +http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2 + +# test removal of different session ids including removal of ; in jsessionid +http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php +http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y +http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html 
+http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 +http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 +http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 +http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html +http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo +http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en +http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47 +# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 +http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 + +# test removal default pages +http://www.foo.com/home/index.html http://www.foo.com/home/ +http://www.foo.com/index.html http://www.foo.com/ +http://www.foo.com/index.htm http://www.foo.com/ +http://www.foo.com/index.asp http://www.foo.com/ +http://www.foo.com/index.aspx http://www.foo.com/ +http://www.foo.com/index.php http://www.foo.com/ +http://www.foo.com/index.php3 http://www.foo.com/ +http://www.foo.com/default.html http://www.foo.com/ +http://www.foo.com/default.htm http://www.foo.com/ +http://www.foo.com/default.asp http://www.foo.com/ +http://www.foo.com/default.aspx http://www.foo.com/ +http://www.foo.com/default.php http://www.foo.com/ +http://www.foo.com/default.php3 http://www.foo.com/ 
+http://www.foo.com/something.php3 http://www.foo.com/something.php3 +http://www.foo.com/something.html http://www.foo.com/something.html +http://www.foo.com/something.asp http://www.foo.com/something.asp +http://www.foo.com/index.phtml http://www.foo.com/ +http://www.foo.com/index.cfm http://www.foo.com/ +http://www.foo.com/index.cgi http://www.foo.com/ +http://www.foo.com/index.HTML http://www.foo.com/ +http://www.foo.com/index.Htm http://www.foo.com/ +http://www.foo.com/index.ASP http://www.foo.com/ +http://www.foo.com/index.jsp http://www.foo.com/ +http://www.foo.com/index.jsf http://www.foo.com/ +http://www.foo.com/index.jspx http://www.foo.com/ +http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx +http://www.foo.com/index.jspa http://www.foo.com/ +http://www.foo.com/index.jsps http://www.foo.com/index.jsps +http://www.foo.com/index.aspX http://www.foo.com/ +http://www.foo.com/index.PhP http://www.foo.com/ +http://www.foo.com/index.PhP4 http://www.foo.com/ +http://www.foo.com/default.HTml http://www.foo.com/ +http://www.foo.com/default.HTm http://www.foo.com/ +http://www.foo.com/default.ASp http://www.foo.com/ +http://www.foo.com/default.AspX http://www.foo.com/ +http://www.foo.com/default.PHP http://www.foo.com/ +http://www.foo.com/default.PHP3 http://www.foo.com/ +http://www.foo.com/index.phtml http://www.foo.com/ +http://www.foo.com/index.cfm http://www.foo.com/ +http://www.foo.com/index.cgi http://www.foo.com/ + +# ensure keeping non-default pages +http://www.foo.com/foo.php3 http://www.foo.com/foo.php3 +http://www.foo.com/foo.html http://www.foo.com/foo.html +http://www.foo.com/foo.asp http://www.foo.com/foo.asp + +# test removal of interpage anchors and keeping query string +http://www.foo.com/foo.html#something http://www.foo.com/foo.html +http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y + +# test general cleaning of bad urls +http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y 
+http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a +http://www.foo.com/foo.html? http://www.foo.com/foo.html + +# remove double slashes but keep 2 slashes after protocol +http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html +https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html + +# normalize file: protocol prefix (keep one slash) +file:///path//foo.html file:/path/foo.html +file:/path//foo.html file:/path/foo.html http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml new file mode 100644 index 0000000..4d6eabc --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-default.xml @@ -0,0 +1,66 @@ +<?xml version="1.0"?> +<!-- This is the configuration file for the RegexUrlNormalize Class. + This is intended so that users can specify substitutions to be + done on URLs. The regex engine that is used is Perl5 compatible. + The rules are applied to URLs in the order they occur in this file. --> + +<!-- WATCH OUT: an xml parser reads this file an ampersands must be + expanded to & --> + +<!-- The following rules show how to strip out session IDs, default pages, + interpage anchors, etc. Order does matter! --> +<regex-normalize> + +<!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> +<regex> + <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&|#|$)</pattern> + <substitution>$4</substitution> +</regex> + +<!-- changes default pages into standard for /index.html, etc. 
into / --> +<!-- these are commented in the default file but uncommented here for testing --> +<regex> + <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern> + <substitution>/$3</substitution> +</regex> + +<!-- removes interpage href anchors such as site.com#location --> +<regex> + <pattern>#.*?(\?|&amp;|$)</pattern> + <substitution>$1</substitution> +</regex> + +<!-- cleans ?&var=value into ?var=value --> +<regex> + <pattern>\?&amp;</pattern> + <substitution>\?</substitution> +</regex> + +<!-- cleans multiple sequential ampersands into a single ampersand --> +<regex> + <pattern>&amp;{2,}</pattern> + <substitution>&amp;</substitution> +</regex> + +<!-- removes trailing ?, ampersands, . --> +<regex> + <pattern>[\?&amp;\.]$</pattern> + <substitution></substitution> +</regex> + +<!-- normalize file:/// protocol prefix: --> +<!-- keep one single slash (NUTCH-1483) --> +<regex> + <pattern>^file://+</pattern> + <substitution>file:/</substitution> +</regex> + +<!-- removes duplicate slashes but --> +<!-- * allow 2 slashes after colon ':' (indicating protocol) --> +<regex> + <pattern>(?&lt;!:)/{2,}</pattern> + <substitution>/</substitution> +</regex> + +</regex-normalize> + http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test new file mode 100644 index 0000000..9d92880 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.test @@ -0,0 +1,8 @@ +# test removal of subdomains +http://www.foo.bar.com/ http://bar.com/ + +# test removal of url path +http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://bar.com/ + +# test removal of urls in 
arguments +https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php https://bar.com/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml new file mode 100644 index 0000000..3698968 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/resources/regex-normalize-scope1.xml @@ -0,0 +1,21 @@ +<?xml version="1.0"?> +<!-- This is the configuration file for the RegexUrlNormalize Class. + This is intended so that users can specify substitutions to be + done on URLs. The regex engine that is used is Perl5 compatible. + The rules are applied to URLs in the order they occur in this file. --> + +<!-- WATCH OUT: an xml parser reads this file an ampersands must be + expanded to &amp; --> + +<!-- + The following rules show how to reduce urls so that + urls from the same domain are identical. This is useful + e.g. when calculating host counts, or splitting fetchlists. 
+--> +<regex-normalize> +<regex> + <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern> + <substitution>$1$3/</substitution> +</regex> +</regex-normalize> + http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-slash/data/slashes.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/data/slashes.txt b/nutch-plugins/urlnormalizer-slash/data/slashes.txt deleted file mode 100644 index d3bd70a..0000000 --- a/nutch-plugins/urlnormalizer-slash/data/slashes.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Both domains have duplicate URL's, some with slashes and some without - -# We prefer this domain with slashes -www.example.org + - -# ..but this domain without -www.example.net - \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt b/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt new file mode 100644 index 0000000..d3bd70a --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/src/test/resources/slashes.txt @@ -0,0 +1,7 @@ +# Both domains have duplicate URL's, some with slashes and some without + +# We prefer this domain with slashes +www.example.org + + +# ..but this domain without +www.example.net - \ No newline at end of file
