svn commit: r379403 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nu

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 01:54:21 2006
New Revision: 379403

URL: http://svn.apache.org/viewcvs?rev=379403&view=rev
Log:
NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id

Modified:
lucene/nutch/trunk/conf/parse-plugins.dtd
lucene/nutch/trunk/conf/parse-plugins.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java

lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java

lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java

lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml

Modified: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403&r1=379402&r2=379403&view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.dtd (original)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006
@@ -1,7 +1,12 @@
-<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT parse-plugins  (mimeType+,aliases)>
 <!ELEMENT mimeType (plugin+)>
 <!ATTLIST mimeType name CDATA #REQUIRED>
 
 <!ELEMENT plugin EMPTY>
 <!ATTLIST plugin id CDATA #REQUIRED>
-<!ATTLIST plugin order CDATA ''>
\ No newline at end of file
+<!ATTLIST plugin order CDATA ''>
+
+<!ELEMENT aliases (alias+)>
+<!ELEMENT alias EMPTY>
+<!ATTLIST alias name CDATA #REQUIRED>
+<!ATTLIST alias extension-id CDATA #REQUIRED>
\ No newline at end of file

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403&r1=379402&r2=379403&view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006
@@ -218,4 +218,33 @@
plugin id=parse-ext /
/mimeType
 
+   !--  alias mappings for parse-xxx names to the actual extension 
implementation 
+   ids described in each plugin's plugin.xml file --
+   aliases
+   alias name=parse-ext extension-id=ExtParser /
+   alias name=parse-html
+   extension-id=org.apache.nutch.parse.html.HtmlParser /
+   alias name=parse-js extension-id=JSParser /
+   alias name=parse-mp3
+   extension-id=org.apache.nutch.parse.mp3.MP3Parser /
+   alias name=parse-msexcel
+   
extension-id=org.apache.nutch.parse.msexcel.MSExcelParser /
+   alias name=parse-mspowerpoint
+   
extension-id=org.apache.nutch.parse.mspowerpoint.MSPowerPointParser /
+   alias name=parse-msword
+   
extension-id=org.apache.nutch.parse.msword.MSWordParser /
+   alias name=parse-pdf
+   extension-id=org.apache.nutch.parse.pdf.PdfParser /
+   alias name=parse-rss
+   extension-id=org.apache.nutch.parse.rss.RSSParser /
+   alias name=parse-rtf
+   
extension-id=org.apache.nutch.parse.rtf.RTFParseFactory /
+   alias name=parse-swf
+   extension-id=org.apache.nutch.parse.swf.SWFParser /
+   alias name=parse-text
+   extension-id=org.apache.nutch.parse.text.TextParser /
+   alias name=parse-zip
+   extension-id=org.apache.nutch.parse.zip.ZipParser /
+   /aliases
+   
 /parse-plugins


svn commit: r379419 - in /lucene/nutch/trunk: site/mailing_lists.html site/mailing_lists.pdf src/site/src/documentation/content/xdocs/mailing_lists.xml

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 03:10:42 2006
New Revision: 379419

URL: http://svn.apache.org/viewcvs?rev=379419&view=rev
Log:
NUTCH-214, Add a search mailing list archive link (Jake Vanderdray)

Modified:
lucene/nutch/trunk/site/mailing_lists.html
lucene/nutch/trunk/site/mailing_lists.pdf

lucene/nutch/trunk/src/site/src/documentation/content/xdocs/mailing_lists.xml

Modified: lucene/nutch/trunk/site/mailing_lists.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/mailing_lists.html?rev=379419&r1=379418&r2=379419&view=diff
==
--- lucene/nutch/trunk/site/mailing_lists.html (original)
+++ lucene/nutch/trunk/site/mailing_lists.html Tue Feb 21 03:10:42 2006
@@ -3,7 +3,7 @@
 head
 META http-equiv=Content-Type content=text/html; charset=UTF-8
 meta content=Apache Forrest name=Generator
-meta name=Forrest-version content=0.6
+meta name=Forrest-version content=0.7
 meta name=Forrest-skin-name content=pelt
 titleNutch Mailing Lists/title
 link type=text/css href=skin/basic.css rel=stylesheet
@@ -16,49 +16,22 @@
 body onload=init()
 script type=text/javascriptndeSetTextSize();/script
 div id=top
-!--+
-|breadtrail
-+--
 div class=breadtrail
 a href=http://www.apache.org/;Apache/a gt; a 
href=http://lucene.apache.org/;Lucene/a gt; a 
href=http://lucene.apache.org/nutch/;Nutch/ascript 
src=skin/breadcrumbs.js language=JavaScript type=text/javascript/script
 /div
-!--+
-|header
-+--
 div class=header
-!--+
-|start group logo
-+--
 div class=grouplogo
 a href=http://lucene.apache.org/;img class=logoImage alt=Lucene 
src=http://lucene.apache.org/java/docs/images/lucene_green_150.gif; 
title=Apache Lucene/a
 /div
-!--+
-|end group logo
-+--
-!--+
-|start Project Logo
-+--
 div class=projectlogo
 a href=http://lucene.apache.org/nutch/;img class=logoImage alt=Nutch 
src=images/nutch-logo.gif title=Open Source Web Search Software/a
 /div
-!--+
-|end Project Logo
-+--
-!--+
-|start Search
-+--
 div class=searchbox
 form action=http://www.google.com/search; method=get class=roundtopsmall
-input value=lucene.apache.org name=sitesearch type=hiddeninput 
onFocus=getBlank (this, 'Search the site with google:'); value=Search the 
site with google: size=25 name=q id=query type=textnbsp; 
-input name=Search value=Search type=submit
+input value=lucene.apache.org name=sitesearch type=hiddeninput 
onFocus=getBlank (this, 'Search the site with google'); size=25 name=q 
id=query type=text value=Search the site with googlenbsp; 
+input attr=value name=Search value=Search 
type=submit
 /form
 /div
-!--+
-|end search
-+--
-!--+
-|start Tabs
-+--
 ul id=tabs
 li class=current
 a class=base-selected href=index.htmlMain/a
@@ -67,110 +40,83 @@
 a class=base-not-selected href=http://wiki.apache.org/nutch/;Wiki/a
 /li
 /ul
-!--+
-|end Tabs
-+--
 /div
 /div
 div id=main
 div id=publishedStrip
-!--+
-|start Subtabs
-+--
 div id=level2tabs/div
-!--+
-|end Endtabs
-+--
-script type=text/javascript language=JavaScript!--
-  document.write(Published:  + document.lastModified);
-  //  --/script
-/div
-!--+
-|breadtrail
-+--
+script type=text/javascript!--
+document.write(textLast Published:/text  + document.lastModified);
+//  --/script
+/div
 div class=breadtrail
  
  nbsp;
/div
-!--+
-|start Menu, mainarea
-+--
-!--+
-|start Menu
-+--
 div id=menu
 div onclick=SwitchMenu('menu_1.1', 'skin/') id=menu_1.1Title 
class=menutitleProject/div
 div id=menu_1.1 class=menuitemgroup
 div class=menuitem
-a title= href=index.htmlNews/a
+a href=index.htmlNews/a
 /div
 div class=menuitem
-a title= href=about.htmlAbout/a
+a href=about.htmlAbout/a
 /div
 div class=menuitem
-a title= href=credits.htmlCredits/a
+a href=credits.htmlCredits/a
 /div
 div class=menuitem
-a title= href=http://www.cafepress.com/nutch/;Buy Stuff/a
+a href=http://www.cafepress.com/nutch/;Buy Stuff/a
 /div
 /div
 div onclick=SwitchMenu('menu_1.2', 'skin/') id=menu_1.2Title 
class=menutitleDocumentation/div
 div id=menu_1.2 class=menuitemgroup
 div class=menuitem
-a title= href=http://wiki.apache.org/nutch/FAQ;FAQ/a
+a href=http://wiki.apache.org/nutch/FAQ;FAQ/a
 /div
 div class=menuitem
-a title= href=http://wiki.apache.org/nutch/;Wiki/a
+a href=http://wiki.apache.org/nutch/;Wiki/a
 /div
 div class=menuitem
-a title= href=tutorial.htmlTutorial/a
+a href=tutorial.htmlTutorial/a
 /div
 div class=menuitem
-a title= href=bot.htmlRobot /a
+a href=bot.htmlRobot /a
 /div
 div class=menuitem
-a title= href=i18n.htmli18n/a
+a href=i18n.htmli18n/a
 /div
 div class=menuitem
-a title= href=apidocs/index.htmlAPI Docs/a
+a href=apidocs/index.htmlAPI Docs/a
 /div
 /div
 div onclick=SwitchMenu('menu_selected_1.3', 'skin/') 
id=menu_selected_1.3Title class=menutitle style=background-image: 

[Nutch Wiki] Update of Website Update HOWTO by JakeVanderdray

2006-02-21 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by JakeVanderdray:
http://wiki.apache.org/nutch/Website_Update_HOWTO

--
- 
- ''This page is prepared for Nutch committers. You need committer rights to 
modify Nutch Website.''
- 
  == What you need ==
  
  [http://forrest.apache.org Apache Forrest] - Publishing framework used for 
Nutch Website maintenance. 
@@ -14, +11 @@

1. Run {{{forrest}}} to build current version of documentation. If the 
build was successful it means Forrest was correctly installed and generated 
site is in {{{src/site/build/site}}} directory.
1. Modify files in {{{src/site/src}}} (mainly in 
{{{src/site/src/documentation/content/xdocs}}}). Run {{{forrest}}} in 
{{{/src/site}}} and review the changes after build.
  
+ If you aren't a committer for this project, you now need to follow the 
instruction in HowToContribute to get your changes updated on the site.  If you 
are a committer, it's time to deploy the site.
+ 
  == How to deploy the site ==
  
1. When you are finally happy with your changes copy files from 
{{{src/site/build/site}}} directory to {{{site}}} and commit them to SVN.


[Nutch Wiki] Update of Website Update HOWTO by JakeVanderdray

2006-02-21 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by JakeVanderdray:
http://wiki.apache.org/nutch/Website_Update_HOWTO

--
1. Run {{{forrest}}} to build current version of documentation. If the 
build was successful it means Forrest was correctly installed and generated 
site is in {{{src/site/build/site}}} directory.
1. Modify files in {{{src/site/src}}} (mainly in 
{{{src/site/src/documentation/content/xdocs}}}). Run {{{forrest}}} in 
{{{/src/site}}} and review the changes after build.
  
- If you aren't a committer for this project, you now need to follow the 
instruction in HowToContribute to get your changes updated on the site.  If you 
are a committer, it's time to deploy the site.
+ If you aren't a committer for this project, you now need to follow the 
instructions in HowToContribute to get your changes applied to the site.  
You'll specifically want to read the sections on Creating a patch and 
Proposing your work.  If you are a committer, it's time to deploy the site.
  
  == How to deploy the site ==
  


svn commit: r379511 - in /lucene/nutch/trunk: docs/fr/help.html src/web/pages/fr/help.xml

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 08:05:07 2006
New Revision: 379511

URL: http://svn.apache.org/viewcvs?rev=379511&view=rev
Log:
Add fr help page

Added:
lucene/nutch/trunk/docs/fr/help.html   (with props)
lucene/nutch/trunk/src/web/pages/fr/help.xml   (with props)

Added: lucene/nutch/trunk/docs/fr/help.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/fr/help.html?rev=379511&view=auto
==
--- lucene/nutch/trunk/docs/fr/help.html (added)
+++ lucene/nutch/trunk/docs/fr/help.html Tue Feb 21 08:05:07 2006
@@ -0,0 +1,161 @@
+!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN
+html!--This page is automatically generated.  Do not edit!--
+head
+META http-equiv=Content-Type content=text/html; charset=UTF-8
+titleNutch: aide/title
+style type=text/css
+.menuTd {background-color: #F9F7F4; height: 25; onMouseOver: 
this.style.backgroundColor='#ECE5DC';}
+.menuTdhover {background-color: #ECE5DC; height: 25; onMouseOver: 
this.style.backgroundColor='#ECE5DC';}
+.menuEntry {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; 
color: #00; text-decoration: none}
+.body {background-color: #F9F7F4;}
+.bodytext {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; 
color: #00; text-decoration: none}
+.title {  font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: 
#FF9900; text-decoration: none}
+.intro {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: 
#FF9900; text-decoration: none}
+.orangeTd {background-color: #FF9900}
+ul {list-style-image: url(../img/reiter/ul.gif)}
+h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
+h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
+.url {color: #996600;}
+.highlight {font-weight: bold;}
+.ellipsis {font-weight: bold;}
+/style
+link type=image/x-icon href=../img/favicon.ico rel=icon
+link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function queryfocus() { document.search.query.focus(); }
+// --/script
+/head
+body onLoad=queryfocus();
+!--This file is automatically generated.  Do not edit!--
+table cellspacing=0 cellpadding=0 border=0 width=635
+tr
+td rowspan=2 width=140 valign=bottoma href=./img border=0 
src=../img/reiter/logo_nutch.gif/aimg height=1 width=140 
src=../img/reiter/spacer_66.gif/td
+/tr
+tr
+td align=right valign=bottom width=495
+table width=495 cellspacing=0 cellpadding=0 border=0
+tr
+td width=400 background=../img/reiter/_bg_reiter.gifnbsp;/tdtd 
width=10 valign=bottom height=28img border=0 
src=../img/reiter/reiter_inactive_le1.gif/tdtd nowrap=nowrap 
valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa 
href=about.html class=bodytextA propos/a/tdtd width=10 
valign=bottom height=28img border=0 
src=../img/reiter/reiter_inactive_ri.gif/tdtd width=10 valign=bottom 
height=28img border=0 src=../img/reiter/reiter_inactive_le.gif/tdtd 
nowrap=nowrap valign=bottom 
background=../img/reiter/_bg_reiter_inactive.gifa 
href=http://wiki.apache.org/nutch/FAQ; class=bodytextQuestions 
freacute;quentes/a/tdtd width=10 valign=bottom height=28img 
border=0 src=../img/reiter/reiter_inactive_ri.gif/td
+/tr
+/table
+/td
+/tr
+/table
+table cellspacing=0 cellpadding=0 border=0 width=635
+tr valign=top
+td width=140
+table cellspacing=0 cellpadding=0 width=100%
+tr
+td#160;/td
+/tr
+/table
+/tdtd background=../img/reiter/_spacer_cc.gif 
width=20#160;/tdtd class=body width=475
+table cellspacing=0 cellpadding=0 border=0 width=475
+tr
+td valign=bottom width=275 height=125 class=titleaide/tdtd 
valign=bottom width=200 height=125img 
src=../img/reiter/robots.gif/td
+/tr
+/table
+br class=br
+span class=bodytext
+
+/spanspan class=bodytext
+h3Requecirc;tes/h3
+/spanspan class=bodytext
+Pour effectuer une recherche avec Nutch, il suffit de saisir quelques mots.
+/spanspan class=bodytext
+ul
+  
+liLes reacute;sultats contiendront uniquement les pages qui contiennent
+span style=font-style: italic;tous/span les mots de la question./li
+  
+liUtilisez les doubles chevrons autour des termes qui doivent ecirc;tre 
adjacents,
+comme dans le cas d'une phrase. Par exemple span style=font-weight: bold;
+Nouvelle Zeacute;lande/span./li
+  
+liLa ponctuation entre les mots est ignoreacute;e. Ainsi, la recherche 
+span style=font-weight: bold;http://www.nutch.org//span est 
eacute;quivalente agrave;
+span style=font-weight: bold;http www nutch org/span./li
+  
+liLa recherche n'est pas sensible agrave; la casse. Ainsi, la recherche
+span style=font-weight: bold;NuTcH/span est eacute;quivalente agrave; 
span style=font-weight: bold;nUtCh/span./li
+  
+liVous pouvez exclure un mot des reacute;sultats en placcedil;ant un signe 
moins
+devant. Ainsi, la recherche span style=font-weight: bold;football
+-nfl/span retournera les pages qui parlent de football, mais qui ne 
contiennent
+pas le mot nfl./li
+
+/ul