http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/nutch.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch.html b/nutch-plugins/parse-tika/src/test/resources/nutch.html new file mode 100644 index 0000000..0aa7c98 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/resources/nutch.html @@ -0,0 +1,519 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> +<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<meta content="Apache Forrest" name="Generator"> +<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-skin-name" content="lucene"> +<title>Welcome to Nutch!</title> +<link type="text/css" href="skin/basic.css" rel="stylesheet"> +<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> +<link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> +<link type="text/css" href="skin/profile.css" rel="stylesheet"> +<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> +<link rel="shortcut icon" href="images/favicon.ico"> +</head> +<body onload="init()"> +<script type="text/javascript">ndeSetTextSize();</script> +<div id="top"> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> +<a href="http://www.apache.org/">Apache</a> > <a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> +</div> +<!--+ + |header + +--> +<div class="header"> +<!--+ + |start group logo + +--> +<div class="grouplogo"> +<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a> +</div> +<!--+ + |end group logo + +--> +<!--+ + |start Project Logo + +--> +<div class="projectlogo"> +<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a> +</div> +<!--+ + |end Project Logo + +--> +<!--+ + |start Search + +--> +<div class="searchbox"> +<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall"> +<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr"> + <input name="Search" value="Search" type="submit"> +</form> +<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a> +</div> +</div> +<!--+ + |end search + +--> +<!--+ + |start Tabs + +--> +<ul id="tabs"> +<li class="current"> +<a class="selected" href="index.html">Main</a> +</li> +<li> +<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a> +</li> +<li> +<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a> +</li> +</ul> +<!--+ + |end Tabs + +--> +</div> +</div> +<div id="main"> +<div id="publishedStrip"> +<!--+ + |start Subtabs + +--> +<div id="level2tabs"></div> +<!--+ + |end Endtabs + +--> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> + + + </div> +<!--+ + |start Menu, mainarea + +--> +<!--+ + |start Menu + +--> +<div id="menu"> +<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div> +<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;"> +<div class="menupage"> +<div class="menupagetitle">News</div> +</div> +<div class="menuitem"> +<a href="about.html">About</a> +</div> +<div class="menuitem"> +<a href="credits.html">Credits</a> +</div> +<div class="menuitem"> +<a href="http://www.cafepress.com/nutch/">Buy Stuff</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div> +<div id="menu_1.2" class="menuitemgroup"> +<div class="menuitem"> +<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a> +</div> +<div class="menuitem"> +<a href="http://wiki.apache.org/nutch/">Wiki</a> +</div> +<div class="menuitem"> +<a href="tutorial.html">Tutorial (0.7.2)</a> +</div> +<div class="menuitem"> +<a href="tutorial8.html">Tutorial (0.8.x)</a> +</div> +<div class="menuitem"> +<a href="bot.html">Robot </a> +</div> +<div class="menuitem"> +<a href="i18n.html">i18n</a> +</div> +<div class="menuitem"> +<a href="apidocs-1.0/index.html">API Docs (1.0)</a> +</div> +<div class="menuitem"> +<a href="apidocs-0.9/index.html">API Docs (0.9)</a> +</div> +<div class="menuitem"> +<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a> +</div> +<div class="menuitem"> +<a href="apidocs/index.html">API Docs (0.7.2)</a> +</div> +<div class="menuitem"> +<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div> +<div id="menu_1.3" class="menuitemgroup"> +<div class="menuitem"> +<a href="release/">Download</a> +</div> +<div class="menuitem"> +<a href="nightly.html">Nightly builds</a> +</div> +<div class="menuitem"> +<a href="mailing_lists.html">Mailing Lists</a> +</div> +<div class="menuitem"> +<a href="issue_tracking.html">Issue Tracking</a> +</div> +<div class="menuitem"> +<a href="version_control.html">Version Control</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div> +<div id="menu_1.4" class="menuitemgroup"> +<div class="menuitem"> +<a href="http://lucene.apache.org/java/">Lucene Java</a> +</div> +<div class="menuitem"> +<a href="http://lucene.apache.org/hadoop/">Hadoop</a> +</div> +<div class="menuitem"> +<a href="http://incubator.apache.org/solr/">Solr</a> +</div> +</div> +<div id="credit"> +<hr> +<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a> +</div> +<div id="roundbottom"> +<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> +<!--+ + |alternative credits + +--> +<div id="credit2"></div> +</div> +<!--+ + |end Menu + +--> +<!--+ + |start content + +--> +<div id="content"> +<div title="Portable Document Format" class="pdflink"> +<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> + PDF</a> +</div> +<h1>Welcome to Nutch!</h1> +<div id="minitoc-area"> +<ul class="minitoc"> +<li> +<a href="#News">News</a> +<ul class="minitoc"> +<li> +<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a> +</li> +<li> +<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a> +</li> +<li> +<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in + Amsterdam</a> +</li> +<li> +<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a> +</li> +<li> +<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a> +</li> +<li> +<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a> +</li> +<li> +<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a> +</li> +<li> +<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a> +</li> +<li> +<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a> +</li> +<li> +<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a> +</li> +<li> +<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a> +</li> +<li> +<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a> +</li> +<li> +<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a> +</li> +</ul> +</li> +</ul> +</div> + + +<a name="N1000D"></a><a name="News"></a> +<h2 class="h3">News</h2> +<div class="section"> +<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a> +<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3> +<p> + +<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009"> + <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png"> + </a> + ApacheCon US is once again in the Bay Area and Lucene is coming + along for the ride! The Lucene community has planned two full + days of talks, plus a meetup and the usual bevy of training. + With a well-balanced mix of first time and veteran ApacheCon + speakers, the + <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a> + at ApacheCon US promises to have something for everyone. Be sure + not to miss: + </p> +<p> Training:</p> +<ul> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a> + - A two day training session, Nov. 2nd & 3rd + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a> + - A one day training session, Nov. 2nd + </li> + +</ul> +<p>Thursday, Nov. 5th</p> +<ul> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem + </a> + - Grant Ingersoll @ 9:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a> + - Michael Busch @ 10:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a> + - Chris Hostetter @ 14:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a> + - Andrzej Bialecki @ 15:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a> + - Mark Miller @ 16:30 + </li> + +</ul> +<p>Friday, Nov. 6th</p> +<ul> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval + Framework for an Organizational Repository</a> + - Sithu D Sudarsan @ 9:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to + Information</a> + - Isabel Drost @ 10:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a> + - Jukka Zitting @ 11:30 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications + with the Lucene Ecosystem</a> + - Ted Dunning @ 14:00 + </li> + +<li> + +<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a> + - Jason Rutherglen @ 15:00 + </li> + +</ul> +<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a> +<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3> +<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements + such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few. + See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt"> + list of changes</a> made in this version. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a> +<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in + Amsterdam</h3> +<p> + +<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009"> + <img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif"> + </a> + + Lucene will be extremely well represented at + <a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a> + in Amsterdam, Netherlands this March 23-27, 2009: + </p> +<ul> + +<li> + +<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a> + - A two day training session, March 23 & 24th</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li> + +<li> +<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li> + + +</ul> +<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a> +<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3> +<p>The 0.9 release of Nutch is now available. This is the second release of Nutch + based entirely on the underlying Hadoop platform. This release includes several critical + bug fixes, as well as key speedups described in more detail at + <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>. + See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt"> + list of changes</a> made in this version. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a> +<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3> +<p>The 0.8.1 release of Nutch is now available. This is a maintenance release to 0.8 branch fixing many serous bugs found in version 0.8. + See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt"> + list of changes</a> made in this version. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a> +<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3> +<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on + hadoop architecure. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup"> + CHANGES.txt</a> for list of changes made in this version. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a> +<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3> +<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See + <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158"> + CHANGES.txt</a> for details. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a> +<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3> +<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See + <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986"> + CHANGES.txt</a> for details. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a> +<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3> +<p>This is the first Nutch release as an Apache Lucene sub-project. See + <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150"> + CHANGES.txt</a> for details. The release is available + <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> +<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a> +<h3 class="h4">June 2005: Nutch graduates from Incubator</h3> +<p>Nutch has now graduated from the Apache incubator, and is now + a Subproject of Lucene.</p> +<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a> +<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3> +<p>Nutch is a two-year-old open source project, previously + hosted at Sourceforge and backed by its own non-profit + organization. The non-profit was founded in order to assign + copyright, so that we could retain the right to change the + license. We have now determined that the Apache license is the + appropriate license for Nutch and no longer require the + overhead of an independent non-profit organization. Nutch's + board of directors and its developers were both polled and + supported the move to the Apache foundation.</p> +<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a> +<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3> +<p>Creative Commons unveiled a beta version of its search + engine, which scours the web for text, images, audio, and video + free to re-use on certain terms a search refinement offered by + no other company or organization.</p> +<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative + Commons Press Release</a> for more details.</p> +<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a> +<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3> +<p>Oregon State University is converting its searching + infrastructure from Googletm to the open source project + Nutch. The effort to replace the Googletm will realize + significant cost savings for Oregon State University, while + promoting both the Nutch Search Engine and transparency in + search engine use and management.</p> +<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source + Lab</a>.</p> +</div> + + +</div> +<!--+ + |end content + +--> +<div class="clearboth"> </div> +</div> +<div id="footer"> +<!--+ + |start bottomstrip + +--> +<div class="lastmodified"> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<div class="copyright"> + Copyright © + 2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> +</div> +<div id="logos"></div> +<!--+ + |end bottomstrip + +--> +</div> +</body> +</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif new file mode 100644 index 0000000..0545a60 Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/ootest.odt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.odt b/nutch-plugins/parse-tika/src/test/resources/ootest.odt new file mode 100644 index 0000000..e36e389 Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.odt differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/ootest.sxw ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.sxw b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw new file mode 100644 index 0000000..260b1c2 Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/ootest.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.txt b/nutch-plugins/parse-tika/src/test/resources/ootest.txt new file mode 100644 index 0000000..685f89a --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/resources/ootest.txt @@ -0,0 +1,30 @@ +Abcedfg ????? +Abcdefg +Abcdefg +abcdefg + + + + + + + + + + + http://www.openoffice.org + +Title +Col1 +Col2 +Col3 +head +Cell1 +Cell2 +Cel3 +total +TOTAL +TOTAL +TOTAL + +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat. http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf new file mode 100644 index 0000000..e7c6e62 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf @@ -0,0 +1,157 @@ +%PDF-1.2 +%���� + +9 0 obj +<< +/Length 10 0 R +/Filter /FlateDecode +>> +stream +H�Í�J�0�� ��{��f�$M��n�-���[&je���ۤ �~�$���}ï¿½É ï¿½Ij���s����~�X�-],��$Y���)�'N�u�1!���V�?��? +�b1Rbb�Ò�H�[��TD:#�&Øï¿½ï¿½X���i�$qnf�����]������a��{��أ���q|J�Ls]�Q�I��j�%��9��`�঺��U�ite�z�$����OeB�Äү�R��@zÜ���g���<��� +endstream +endobj +10 0 obj +246 +endobj +4 0 obj +<< +/Type /Page +/Parent 5 0 R +/Resources << +/Font << +/F0 6 0 R +/F1 7 0 R +>> +/ProcSet 2 0 R +>> +/Contents 9 0 R +>> +endobj +6 0 obj +<< +/Type /Font +/Subtype /TrueType +/Name /F0 +/BaseFont /Arial +/Encoding /WinAnsiEncoding +>> +endobj +7 0 obj +<< +/Type /Font +/Subtype /TrueType +/Name /F1 +/BaseFont /BookAntiqua,Bold +/FirstChar 31 +/LastChar 255 +/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 +296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 +444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 +833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 +500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 +556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 +750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 +750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 +667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 +500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 +444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 +389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 +611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 +333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 +556 ] +/Encoding /WinAnsiEncoding +/FontDescriptor 8 0 R +>> +endobj +8 0 obj +<< +/Type /FontDescriptor +/FontName /BookAntiqua,Bold +/Flags 16418 +/FontBBox [ -250 -260 1236 930 ] +/MissingWidth 750 +/StemV 146 +/StemH 146 +/ItalicAngle 0 +/CapHeight 930 +/XHeight 651 +/Ascent 930 +/Descent 260 +/Leading 210 +/MaxWidth 1030 +/AvgWidth 460 +>> +endobj +2 0 obj +[ /PDF /Text ] +endobj +5 0 obj +<< +/Kids [4 0 R ] +/Count 1 +/Type /Pages +/MediaBox [ 0 0 612 792 ] +>> +endobj +1 0 obj +<< +/Creator (1725.fm) +/CreationDate (1-Jan-3 18:15PM) +/Title (1725.PDF) +/Author (Unknown) +/Producer (Acrobat PDFWriter 3.02 for Windows) +/Keywords () +/Subject () +>> +endobj +3 0 obj +<< +/Pages 5 0 R +/Type /Catalog +/DefaultGray 11 0 R +/DefaultRGB 12 0 R +>> +endobj +11 0 obj +[/CalGray +<< +/WhitePoint [0.9505 1 1.0891 ] +/Gamma 0.2468 +>> +] +endobj +12 0 obj +[/CalRGB +<< +/WhitePoint [0.9505 1 1.0891 ] +/Gamma [0.2468 0.2468 0.2468 ] +/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ] +>> +] +endobj +xref +0 13 +0000000000 65535 f +0000002172 00000 n +0000002046 00000 n +0000002363 00000 n +0000000375 00000 n +0000002080 00000 n +0000000518 00000 n +0000000633 00000 n +0000001760 00000 n +0000000021 00000 n +0000000352 00000 n +0000002460 00000 n +0000002548 00000 n +trailer +<< +/Size 13 +/Root 3 0 R +/Info 1 0 R +/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>] +>> +startxref +2726 +%%EOF http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/rsstest.rss ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/rsstest.rss b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss new file mode 100644 index 0000000..6c4ae48 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="ISO-8859-1" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<rss version="0.91"> + <channel> + <title>TestChannel</title> + <link>http://test.channel.com/</link> + <description>Sample RSS File for Junit test</description> + <language>en-us</language> + + <item> + <title>Home Page of Chris Mattmann</title> + <link>http://www-scf.usc.edu/~mattmann/</link> + <description>Chris Mattmann's home page</description> + </item> + + <item> + <title>Awesome Open Source Search Engine</title> + <link>http://www.nutch.org/</link> + <description>Yup, that's what it is</description> + </item> + </channel> +</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/test.rtf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/test.rtf b/nutch-plugins/parse-tika/src/test/resources/test.rtf new file mode 100644 index 0000000..c67a6c8 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/resources/test.rtf @@ -0,0 +1,17 @@ +{\rtf1\ansi\deff1\adeflang1025 +{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;} +{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;} +{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;} +{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;} +{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;} +{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;} +{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;} +{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;} +} +{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709 +{\*\pgdsctbl +{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}} +{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc +\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog} +\par } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/word97.doc ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/word97.doc b/nutch-plugins/parse-tika/src/test/resources/word97.doc new file mode 100644 index 0000000..4d012da Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/word97.doc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-zip/sample/test.zip ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/sample/test.zip b/nutch-plugins/parse-zip/sample/test.zip deleted file mode 100644 index 0c649d2..0000000 Binary files a/nutch-plugins/parse-zip/sample/test.zip and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-zip/src/test/resources/test.zip ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/test/resources/test.zip b/nutch-plugins/parse-zip/src/test/resources/test.zip new file mode 100644 index 0000000..0c649d2 Binary files /dev/null and b/nutch-plugins/parse-zip/src/test/resources/test.zip differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt deleted file mode 100644 index 9d15cd8..0000000 --- a/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Example configuration file for parsefilter-regex -# -# Parse metadata field <name> is set to true if the HTML matches the regex. The -# source can either be html or text. If source is html, the regex is applied to -# the entire HTML tree. If source is text, the regex is applied to the -# extracted text. -# -# format: <name>\t<source>\t<regex>\n -first html h1 -second text blablabla http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt new file mode 100644 index 0000000..9d15cd8 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt @@ -0,0 +1,10 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field <name> is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. +# +# format: <name>\t<source>\t<regex>\n +first html h1 +second text blablabla http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/protocol-file/sample/testprotocolfile.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/sample/testprotocolfile.txt b/nutch-plugins/protocol-file/sample/testprotocolfile.txt deleted file mode 100644 index fbe8a8a..0000000 --- a/nutch-plugins/protocol-file/sample/testprotocolfile.txt +++ /dev/null @@ -1 +0,0 @@ -Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/protocol-file/sample/testprotocolfile_(encoded).txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/sample/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/sample/testprotocolfile_(encoded).txt deleted file mode 100644 index fbe8a8a..0000000 --- a/nutch-plugins/protocol-file/sample/testprotocolfile_(encoded).txt +++ /dev/null @@ -1 +0,0 @@ -Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt new file mode 100644 index 0000000..fbe8a8a --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt @@ -0,0 +1 @@ +Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt new file mode 100644 index 0000000..fbe8a8a --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt @@ -0,0 +1 @@ +Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules deleted file mode 100644 index a2f6da0..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules +++ /dev/null @@ -1,26 +0,0 @@ -# The url filter file used by the crawl command. - -# Better for intranet crawling. -# Be sure to change MY.DOMAIN.NAME to your domain name. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file:, ftp:, & mailto: urls --(file|ftp|mailto):.* - -# skip image and other suffixes we can't yet parse --.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) - -# skip URLs containing certain characters as probable queries, etc. --.*[?*!@=].* - -# skip .fr .org and .net domains --.*//.*\.fr/.* --.*//.*\.org/.* --.*//.*\.net/.* - -# skip everything else -+.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls deleted file mode 100644 index 40bf4ee..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls +++ /dev/null @@ -1,297 +0,0 @@ -+http://www.hostip.info/ --http://www.elanceur.org/Articles/OntologieSurfaite.html -+http://www.opensymphony.com/quartz/ --http://www.portletbridge.org/saxbenchmark/index.html -+http://www.lesmotsdelinfo.com/ -+http://usefulinc.com/doap/ -+http://www.codezoo.com/ -+http://search.infocious.com/ --http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html -+http://www.brics.dk/%7Eamoeller/automaton/ -+http://jazzz.com/wp.html -+http://www.maxkiesler.com/index.php -+http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html -+http://www.alias-i.com/lingpipe/ --http://johnny.ihackstuff.com/index.php?module=prodreviews --http://www.spurl.net/ -+http://www.dropload.com/ -+http://vivisimo.com/ -+http://www.marumushi.com/apps/newsmap/newsmap.cfm -+http://www.ixquick.com/ --http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html -+http://www.mail-archive.com/ -+http://www.spymac.com/ --http://browsers.evolt.org/ --http://www.oswd.org/ -+http://www.stayinvisible.com/index.pl -+http://java.sun.com/j2se/1.4.2/docs/api/index.html -+http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx -+http://www.bloglines.com/ --http://www.fckeditor.net/ -+http://search.msn.com/ --http://www.grub.org/ -+http://www.xml.com/pub/a/2000/11/29/schemas/part1.html --http://www.mnot.net/cache_docs/ --http://www.furl.net/ -+http://www.blogpulse.com/ -+http://www.googlefight.com/ -+http://www.rokulabs.com/ --http://mightylegends.zapto.org/dvd/dvdauthor_howto.php --http://www.batbox.org/wrt54g-linux.html --http://en.wikipedia.org/wiki/%s -+http://www.sipcenter.com/ -+http://www.merriampark.com/ld.htm -+http://anon.inf.tu-dresden.de/index_en.html -+http://www.pluck.com/ -+http://www.tiddlywiki.com/ -+http://www.jux2.com/ -+http://clusty.com/ --http://findability.org/ -+http://www.searchengineshowdown.com/ -+http://www.nhacks.com/email/index.php -+http://www.koders.com/ -+http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf -+http://www.gmailwiki.com/index.php/Main_Page -+http://www.tadalist.com/ -+http://www.net2ftp.com/ -+http://www.streamload.com/ -+http://www.lucazappa.com/brilliantMaker/buttonImage.php -+http://www.hybernaut.com/bdv/delicious-import.html -+http://www.gtmcknight.com/buttons/ -+http://amb.vis.ne.jp/mozilla/scrapbook/ -+http://g-metrics.com/index.php --http://tor.eff.org/ -+http://www.search-this.com/search_engine_decoder.asp -+http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html -+http://www.adaptivepath.com/publications/essays/archives/000385.php --http://isnoop.net/gmail/ --http://openweb.eu.org/ -+http://www.mistergooddeal.com/ -+http://javatoolbox.com/ --http://www.freenews.fr/ -+http://www.wikiwax.com/ --http://today.java.net/pub/a/today/2005/04/21/farm.html -+http://users.skynet.be/J.Beever/pave.htm -+http://www.lundi8h.com/ -+http://www.snap.com/ -+http://www.goosee.com/puppy/index.shtml --http://www.softwarefreedom.org/index.html --http://y.20q.net/ -+http://www.bitty.com/ -+http://www.lafraise.com/ --http://www.liquidinformation.org/ -+http://www.searchtools.com/ -+http://www.martinfowler.com/articles/injection.html -+http://pdos.csail.mit.edu/scigen/ --http://developer.yahoo.net/blog/ -+http://blogger-templates.blogspot.com/ -+http://phpadsnew.com/two/ -+http://www.langreiter.com/exec/yahoo-vs-google.html --http://www.dataparksearch.org/ --http://www.yubnub.org/ --http://www.fing.org/ --http://www.swish-e.org/ --http://www.openajax.net/wordpress/ -+http://crypto.stanford.edu/PwdHash/ -+http://www.html-kit.com/favicon/ --http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 -+http://www.durhamtownship.com/ -+http://jiwire.com/ -+http://www.insilmaril.de/vym/ --http://www.spreadshirt.net/ -+http://www.goffice.com/ -+http://www.writely.com/ -+http://www.milindparikh.com/ -+http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html -+http://www.wikyblog.com/Map/Guest/Home --http://www.kottke.org/05/08/googleos-webos -+http://www.rollyo.com/ -+http://www.meebo.com/ -+http://www.factbites.com/ -+http://www.placeopedia.com/ -+http://swoogle.umbc.edu/ -+http://www.viaduc.com/ --http://demo.wikiwyg.net/wikiwyg/demo/standalone/ -+http://podcasts.yahoo.com/ --http://beaglewiki.org/Main_Page -+http://yq.search.yahoo.com/ --http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 -+http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html -+http://socialight.com/ -+http://www.lexxe.com/ -+http://www.xom.nu/ -+http://www.turboprint.de/ -+http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 -+http://www.wi-fiplanet.com/tutorials/article.php/3562391 -+http://particletree.com/features/10-tips-to-a-better-form/ -+http://www.songbirdnest.com/ --http://www.w3.org/Talks/Tools/Slidy/ --http://www.compassframework.org/display/SITE/Home -+http://motrech.blogspot.com/ -+http://www.moteurzine.com/ -+http://www.mex-search.com/ --http://beta.previewseek.com/?mdc=y&twin=n&ilang=french -+http://www.goshme.com/ -+http://rialto.application-servers.com/ -+http://www.multe-pass.com/ -+http://www.tailrank.com/ -+http://www.vandertramp.com/INTERNETDOWN/ -+http://www.letterjames.de/index.html -+http://code.google.com/index.html -+http://www.kritx.com/ -+http://performancing.com/firefox -+http://www.mywebsearch.com/ --http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 -+http://www.lukew.com/resources/articles/blogs2.asp --http://www.hyperwords.net/ -+http://ajax.parish.ath.cx/translator/ -+http://www.maplandia.com/ --http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages -+http://onefeed.com/index.php -+http://www.file-swap.com/ --http://opennlp.org/ -+http://mindprod.com/jgloss/encoding.html -+http://code.google.com/webstats/index.html -+http://www.freeweb-hosting.com/google_pagerank_pr_checker/ --http://www.framakey.org/ --http://microformats.org/wiki/hreview --http://www.ashesandsnow.org/index2.html --http://uima-framework.sourceforge.net/ -+http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html --http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 -+http://fr.techcrunch.com/ --http://developer.yahoo.net/yui/ -+http://www.fredrikodman.com/ -+http://www.mpirical.com/companion/mpirical_companion.html -+http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html --http://k9copy.free.fr/ --http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 --http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design --http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 -+http://blogokat.canalblog.com/archives/2005/11/02/882454.html -+http://robur.slu.se/jensl/xmlclitools/ --http://www.internetactu.net/?p=6291 --http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 -+http://www.memodata.com/2004/fr/alexandria/ --http://presse-citron.net/?2006/01/23/654-joomla-pete-grave -+http://www.randomerror.com/ -+http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ --http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 --http://interstices.info/display.jsp?id=c_15918 -+http://www.tech-invite.com/ -+http://www.croczilla.com/zap --http://www.libervis.com/modules/wordpress/?p=13 -+http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ --http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm -+http://www.influo.com/ -+http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html --http://www.addnb.org/fr/docs/webinvisible.htm --http://manhack.net/ --http://www.jibaku.net/ -+http://www.pipologie.com/ -+http://christophenoel.blogspot.com/ --http://www.seekport.fr/seekbot/ -+http://beta.exalead.com/ --http://www.boolgum.fr/index.html -+http://www.kesako.canalblog.com/ -+http://loran.blogspot.com/ -+http://outils-recherche.blogspot.com/ -+http://www.art-dept.com/artists/giacobbe/ -+http://www.meggould.netfirms.com/site_seeingIII.htm -+http://www.freedpi.com/ -+http://www.frenchfred.com/ -+http://www.photoways.com/ --http://freco.free.fr/index.htm --http://triturages.free.fr/index.htm --http://www.qsos.org/ -+http://www.alvis.info/alvis/ -+http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ --http://www.shinux.org/ -+http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml -+http://www.kurobox.com/online/tiki-index.php --http://news.gmane.org/gmane.comp.misc.linkstation.linux -+http://www.imsbook.com/SIP-IMS-Standards-List.html --http://incubator.apache.org/directory/subprojects/snickers/ --http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html --http://sourceforge.net/projects/cryptix-asn1/ --http://sourceforge.net/projects/basn/ --http://asn1.elibel.tm.fr/fr/index.htm --http://sourceforge.net/projects/a2j/ -+http://www.degrouptest.com/ -+http://interstices.info/ -+http://louvre-boite.viabloga.com/news/18.shtml --http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html -+http://poiplace.oabsoftware.nl/ --http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 --http://www.yoono.com/favorites.jsp?user-id=lquerel --http://www.librecours.org/cgi-bin/main --http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 --http://limo.sourceforge.net/ -+http://www-scf.usc.edu/%7Emattmann/ -+http://spaces.msn.com/members/famillezen/ --http://photos.joune.org/ --http://www.canon.fr/paperart/ -+http://flash.eastweb.ru/files/20051024092150.swf -+http://www.xsltwiki.com/index.php/Main_Page -+http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ --http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 -+http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html --http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ -+http://www.aeliosfinance.com/ -+http://www.capital-it.com/ --http://www.tradedoubler.fr/pan/public/solutions/publisher --http://www.recherche.gouv.fr/technologie/concours/2006/index.htm -+http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ -+http://wanabo.com/ --http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 --http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam -+http://aeliosfinance.com/ -+http://www.centreincubation.com/ -+http://www.franceincubation.com/ --http://www.oseo.fr/ -+http://www.i18nfaq.com/chardet.html --http://cpdetector.sourceforge.net/ -+http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles -+http://chezlorry.ca/Accueil.htm -+http://cetnia.blogs.com/d_lires/ --http://www.directwine.fr/ -+http://www.new-phenix.com/ --http://upnp.sourceforge.net/ --http://www.pixmania.fr/ --http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 -+http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ -+http://www.stepnewz.com/sn/default.asp -+http://opquast.com/ --http://www.freeplayer.org/ --http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie --http://atomcomputer.free.fr/fbox/ --http://www.internetactu.net/index.php?p=6100 --http://mammouthland.free.fr/cours/css/genecss.php --http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 -+http://www-106.ibm.com/developerworks/xml/library/x-xapi.html --http://xml.apache.org/xalan-j/extensions.html -+http://developers.sun.com/foryourbusiness/jcc/ -+http://blogs.sun.com/roller/page/roumen/Weblog --http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 --http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 -+http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ -+http://odur.let.rug.nl/%7Evannoord/ --http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html --http://artist.inist.fr/ -+http://www.elra.info/ --http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO -+http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability -+http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval -+http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ -+http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ -+http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ -+http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ -+http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ -+http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html --http://www.lexique.org/ -+http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ -+http://www.streamium.com/products/mx6000i/ --http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr --http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 -+http://www.tversity.com/ --http://www.aspseek.org/index.php \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules deleted file mode 100644 index 8966183..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules +++ /dev/null @@ -1,24 +0,0 @@ -# The url filter file used by the crawl command. - -# Better for intranet crawling. -# Be sure to change MY.DOMAIN.NAME to your domain name. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file:, ftp:, & mailto: urls --(file|ftp|mailto):.* - -# skip image and other suffixes we can't yet parse --.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) - -# skip URLs containing certain characters as probable queries, etc. --.*[?*!@=].* - -# accept hosts in MY.DOMAIN.NAME -+http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.* - -# skip everything else --.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls deleted file mode 100644 index b1ad9b7..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls +++ /dev/null @@ -1,8 +0,0 @@ --file://home/jc/nutch/index.html --ftp://ftp.apache.org/nutch.html --mailto:[email protected] --news://any.news.server/comp.lang.java --whois:/nutch.org -+http://MY.DOMAIN.NAME/ -+http://MY.DOMAIN.NAME/nutch -+http://www.MY.DOMAIN.NAME/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules deleted file mode 100644 index dfae8b0..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules +++ /dev/null @@ -1,19 +0,0 @@ -# The default url filter. -# Better for whole-internet crawling. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file: ftp: and mailto: urls --(file|ftp|mailto):.* - -# skip image and other suffixes we can't yet parse --.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe) - -# skip URLs containing certain characters as probable queries, etc. --.*[?*!@=].* - -# accept anything else -+.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls deleted file mode 100644 index d3b1bf3..0000000 --- a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls +++ /dev/null @@ -1,11 +0,0 @@ --file://home/jc/nutch/index.html --ftp://ftp.apache.org/nutch.html --mailto:[email protected] -+news://any.news.server/comp.lang.java -+whois:/nutch.org --http://www.nutch.org/nutch.gif --http://www.nutch.org/nutch.eps --http://www.nutch.org/nutch?q=nutch -+http://www.nutch.org/ -+http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ -+http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules new file mode 100644 index 0000000..a2f6da0 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) + +# skip URLs containing certain characters as probable queries, etc. +-.*[?*!@=].* + +# skip .fr .org and .net domains +-.*//.*\.fr/.* +-.*//.*\.org/.* +-.*//.*\.net/.* + +# skip everything else ++.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls new file mode 100644 index 0000000..40bf4ee --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http://johnny.ihackstuff.com/index.php?module=prodreviews +-http://www.spurl.net/ ++http://www.dropload.com/ ++http://vivisimo.com/ ++http://www.marumushi.com/apps/newsmap/newsmap.cfm ++http://www.ixquick.com/ +-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html ++http://www.mail-archive.com/ ++http://www.spymac.com/ +-http://browsers.evolt.org/ +-http://www.oswd.org/ ++http://www.stayinvisible.com/index.pl ++http://java.sun.com/j2se/1.4.2/docs/api/index.html ++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx ++http://www.bloglines.com/ +-http://www.fckeditor.net/ ++http://search.msn.com/ +-http://www.grub.org/ ++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html +-http://www.mnot.net/cache_docs/ +-http://www.furl.net/ ++http://www.blogpulse.com/ ++http://www.googlefight.com/ ++http://www.rokulabs.com/ +-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php +-http://www.batbox.org/wrt54g-linux.html +-http://en.wikipedia.org/wiki/%s ++http://www.sipcenter.com/ ++http://www.merriampark.com/ld.htm ++http://anon.inf.tu-dresden.de/index_en.html ++http://www.pluck.com/ ++http://www.tiddlywiki.com/ ++http://www.jux2.com/ ++http://clusty.com/ +-http://findability.org/ ++http://www.searchengineshowdown.com/ ++http://www.nhacks.com/email/index.php ++http://www.koders.com/ ++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf ++http://www.gmailwiki.com/index.php/Main_Page ++http://www.tadalist.com/ ++http://www.net2ftp.com/ ++http://www.streamload.com/ ++http://www.lucazappa.com/brilliantMaker/buttonImage.php ++http://www.hybernaut.com/bdv/delicious-import.html ++http://www.gtmcknight.com/buttons/ ++http://amb.vis.ne.jp/mozilla/scrapbook/ ++http://g-metrics.com/index.php +-http://tor.eff.org/ ++http://www.search-this.com/search_engine_decoder.asp ++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html ++http://www.adaptivepath.com/publications/essays/archives/000385.php +-http://isnoop.net/gmail/ +-http://openweb.eu.org/ ++http://www.mistergooddeal.com/ ++http://javatoolbox.com/ +-http://www.freenews.fr/ ++http://www.wikiwax.com/ +-http://today.java.net/pub/a/today/2005/04/21/farm.html ++http://users.skynet.be/J.Beever/pave.htm ++http://www.lundi8h.com/ ++http://www.snap.com/ ++http://www.goosee.com/puppy/index.shtml +-http://www.softwarefreedom.org/index.html +-http://y.20q.net/ ++http://www.bitty.com/ ++http://www.lafraise.com/ +-http://www.liquidinformation.org/ ++http://www.searchtools.com/ ++http://www.martinfowler.com/articles/injection.html ++http://pdos.csail.mit.edu/scigen/ +-http://developer.yahoo.net/blog/ ++http://blogger-templates.blogspot.com/ ++http://phpadsnew.com/two/ ++http://www.langreiter.com/exec/yahoo-vs-google.html +-http://www.dataparksearch.org/ +-http://www.yubnub.org/ +-http://www.fing.org/ +-http://www.swish-e.org/ +-http://www.openajax.net/wordpress/ ++http://crypto.stanford.edu/PwdHash/ ++http://www.html-kit.com/favicon/ +-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 ++http://www.durhamtownship.com/ ++http://jiwire.com/ ++http://www.insilmaril.de/vym/ +-http://www.spreadshirt.net/ ++http://www.goffice.com/ ++http://www.writely.com/ ++http://www.milindparikh.com/ ++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html ++http://www.wikyblog.com/Map/Guest/Home +-http://www.kottke.org/05/08/googleos-webos ++http://www.rollyo.com/ ++http://www.meebo.com/ ++http://www.factbites.com/ ++http://www.placeopedia.com/ ++http://swoogle.umbc.edu/ ++http://www.viaduc.com/ +-http://demo.wikiwyg.net/wikiwyg/demo/standalone/ ++http://podcasts.yahoo.com/ +-http://beaglewiki.org/Main_Page ++http://yq.search.yahoo.com/ +-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 ++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html ++http://socialight.com/ ++http://www.lexxe.com/ ++http://www.xom.nu/ ++http://www.turboprint.de/ ++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 ++http://www.wi-fiplanet.com/tutorials/article.php/3562391 ++http://particletree.com/features/10-tips-to-a-better-form/ ++http://www.songbirdnest.com/ +-http://www.w3.org/Talks/Tools/Slidy/ +-http://www.compassframework.org/display/SITE/Home ++http://motrech.blogspot.com/ ++http://www.moteurzine.com/ ++http://www.mex-search.com/ +-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french ++http://www.goshme.com/ ++http://rialto.application-servers.com/ ++http://www.multe-pass.com/ ++http://www.tailrank.com/ ++http://www.vandertramp.com/INTERNETDOWN/ ++http://www.letterjames.de/index.html ++http://code.google.com/index.html ++http://www.kritx.com/ ++http://performancing.com/firefox ++http://www.mywebsearch.com/ +-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 ++http://www.lukew.com/resources/articles/blogs2.asp +-http://www.hyperwords.net/ ++http://ajax.parish.ath.cx/translator/ ++http://www.maplandia.com/ +-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages ++http://onefeed.com/index.php ++http://www.file-swap.com/ +-http://opennlp.org/ ++http://mindprod.com/jgloss/encoding.html ++http://code.google.com/webstats/index.html ++http://www.freeweb-hosting.com/google_pagerank_pr_checker/ +-http://www.framakey.org/ +-http://microformats.org/wiki/hreview +-http://www.ashesandsnow.org/index2.html +-http://uima-framework.sourceforge.net/ ++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html +-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 ++http://fr.techcrunch.com/ +-http://developer.yahoo.net/yui/ ++http://www.fredrikodman.com/ ++http://www.mpirical.com/companion/mpirical_companion.html ++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html +-http://k9copy.free.fr/ +-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 +-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design +-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 ++http://blogokat.canalblog.com/archives/2005/11/02/882454.html ++http://robur.slu.se/jensl/xmlclitools/ +-http://www.internetactu.net/?p=6291 +-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 ++http://www.memodata.com/2004/fr/alexandria/ +-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave ++http://www.randomerror.com/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ +-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 +-http://interstices.info/display.jsp?id=c_15918 ++http://www.tech-invite.com/ ++http://www.croczilla.com/zap +-http://www.libervis.com/modules/wordpress/?p=13 ++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ +-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm ++http://www.influo.com/ ++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html +-http://www.addnb.org/fr/docs/webinvisible.htm +-http://manhack.net/ +-http://www.jibaku.net/ ++http://www.pipologie.com/ ++http://christophenoel.blogspot.com/ +-http://www.seekport.fr/seekbot/ ++http://beta.exalead.com/ +-http://www.boolgum.fr/index.html ++http://www.kesako.canalblog.com/ ++http://loran.blogspot.com/ ++http://outils-recherche.blogspot.com/ ++http://www.art-dept.com/artists/giacobbe/ ++http://www.meggould.netfirms.com/site_seeingIII.htm ++http://www.freedpi.com/ ++http://www.frenchfred.com/ ++http://www.photoways.com/ +-http://freco.free.fr/index.htm +-http://triturages.free.fr/index.htm +-http://www.qsos.org/ ++http://www.alvis.info/alvis/ ++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ +-http://www.shinux.org/ ++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml ++http://www.kurobox.com/online/tiki-index.php +-http://news.gmane.org/gmane.comp.misc.linkstation.linux ++http://www.imsbook.com/SIP-IMS-Standards-List.html +-http://incubator.apache.org/directory/subprojects/snickers/ +-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html +-http://sourceforge.net/projects/cryptix-asn1/ +-http://sourceforge.net/projects/basn/ +-http://asn1.elibel.tm.fr/fr/index.htm +-http://sourceforge.net/projects/a2j/ ++http://www.degrouptest.com/ ++http://interstices.info/ ++http://louvre-boite.viabloga.com/news/18.shtml +-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html ++http://poiplace.oabsoftware.nl/ +-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 +-http://www.yoono.com/favorites.jsp?user-id=lquerel +-http://www.librecours.org/cgi-bin/main +-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 +-http://limo.sourceforge.net/ ++http://www-scf.usc.edu/%7Emattmann/ ++http://spaces.msn.com/members/famillezen/ +-http://photos.joune.org/ +-http://www.canon.fr/paperart/ ++http://flash.eastweb.ru/files/20051024092150.swf ++http://www.xsltwiki.com/index.php/Main_Page ++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ +-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 ++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html +-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ ++http://www.aeliosfinance.com/ ++http://www.capital-it.com/ +-http://www.tradedoubler.fr/pan/public/solutions/publisher +-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm ++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ ++http://wanabo.com/ +-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 +-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam ++http://aeliosfinance.com/ ++http://www.centreincubation.com/ ++http://www.franceincubation.com/ +-http://www.oseo.fr/ ++http://www.i18nfaq.com/chardet.html +-http://cpdetector.sourceforge.net/ ++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles ++http://chezlorry.ca/Accueil.htm ++http://cetnia.blogs.com/d_lires/ +-http://www.directwine.fr/ ++http://www.new-phenix.com/ +-http://upnp.sourceforge.net/ +-http://www.pixmania.fr/ +-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 ++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ ++http://www.stepnewz.com/sn/default.asp ++http://opquast.com/ +-http://www.freeplayer.org/ +-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie +-http://atomcomputer.free.fr/fbox/ +-http://www.internetactu.net/index.php?p=6100 +-http://mammouthland.free.fr/cours/css/genecss.php +-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 ++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html +-http://xml.apache.org/xalan-j/extensions.html ++http://developers.sun.com/foryourbusiness/jcc/ ++http://blogs.sun.com/roller/page/roumen/Weblog +-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 +-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 ++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ ++http://odur.let.rug.nl/%7Evannoord/ +-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html +-http://artist.inist.fr/ ++http://www.elra.info/ +-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO ++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability ++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval ++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ ++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ ++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ ++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ ++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html +-http://www.lexique.org/ ++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ ++http://www.streamium.com/products/mx6000i/ +-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr +-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 ++http://www.tversity.com/ +-http://www.aspseek.org/index.php \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules new file mode 100644 index 0000000..8966183 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules @@ -0,0 +1,24 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) + +# skip URLs containing certain characters as probable queries, etc. +-.*[?*!@=].* + +# accept hosts in MY.DOMAIN.NAME ++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.* + +# skip everything else +-.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls new file mode 100644 index 0000000..b1ad9b7 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls @@ -0,0 +1,8 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] +-news://any.news.server/comp.lang.java +-whois:/nutch.org ++http://MY.DOMAIN.NAME/ ++http://MY.DOMAIN.NAME/nutch ++http://www.MY.DOMAIN.NAME/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules new file mode 100644 index 0000000..dfae8b0 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules @@ -0,0 +1,19 @@ +# The default url filter. +# Better for whole-internet crawling. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file: ftp: and mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe) + +# skip URLs containing certain characters as probable queries, etc. +-.*[?*!@=].* + +# accept anything else ++.* http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls new file mode 100644 index 0000000..d3b1bf3 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls @@ -0,0 +1,11 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] ++news://any.news.server/comp.lang.java ++whois:/nutch.org +-http://www.nutch.org/nutch.gif +-http://www.nutch.org/nutch.eps +-http://www.nutch.org/nutch?q=nutch ++http://www.nutch.org/ ++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ ++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-domain/data/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/data/hosts.txt b/nutch-plugins/urlfilter-domain/data/hosts.txt deleted file mode 100644 index 2b88c3b..0000000 --- a/nutch-plugins/urlfilter-domain/data/hosts.txt +++ /dev/null @@ -1,5 +0,0 @@ -# comments start with the pound sign -net -apache.org -be -www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt new file mode 100644 index 0000000..2b88c3b --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt @@ -0,0 +1,5 @@ +# comments start with the pound sign +net +apache.org +be +www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt deleted file mode 100644 index 2b88c3b..0000000 --- a/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt +++ /dev/null @@ -1,5 +0,0 @@ -# comments start with the pound sign -net -apache.org -be -www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt new file mode 100644 index 0000000..2b88c3b --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt @@ -0,0 +1,5 @@ +# comments start with the pound sign +net +apache.org +be +www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete b/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules b/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules deleted file mode 100644 index c8901e2..0000000 --- a/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules +++ /dev/null @@ -1,26 +0,0 @@ -# The url filter file used by the crawl command. - -# Better for intranet crawling. -# Be sure to change MY.DOMAIN.NAME to your domain name. - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): - -# skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ - -# skip URLs containing certain characters as probable queries, etc. --[?*!@=] - -# skip .fr .org and .net domains --^.*//.*\.fr/ --^.*//.*\.org/ --^.*//.*\.net/ - -# skip everything else -+.
