svn commit: r379403 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nu
Author: jerome Date: Tue Feb 21 01:54:21 2006 New Revision: 379403 URL: http://svn.apache.org/viewcvs?rev=379403&view=rev Log: NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id Modified: lucene/nutch/trunk/conf/parse-plugins.dtd lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Modified: 
lucene/nutch/trunk/conf/parse-plugins.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403r1=379402r2=379403view=diff == --- lucene/nutch/trunk/conf/parse-plugins.dtd (original) +++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006 @@ -1,7 +1,12 @@ -!ELEMENT parse-plugins (mimeType+) +!ELEMENT parse-plugins (mimeType+,aliases) !ELEMENT mimeType (plugin+) !ATTLIST mimeType name CDATA #REQUIRED !ELEMENT plugin EMPTY !ATTLIST plugin id CDATA #REQUIRED -!ATTLIST plugin order CDATA '' \ No newline at end of file +!ATTLIST plugin order CDATA '' + +!ELEMENT aliases (alias+) +!ELEMENT alias EMPTY +!ATTLIST alias name CDATA #REQUIRED +!ATTLIST alias extension-id CDATA #REQUIRED \ No newline at end of file Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403r1=379402r2=379403view=diff == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006 @@ -218,4 +218,33 @@ plugin id=parse-ext / /mimeType + !-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file -- + aliases + alias name=parse-ext extension-id=ExtParser / + alias name=parse-html + extension-id=org.apache.nutch.parse.html.HtmlParser / + alias name=parse-js extension-id=JSParser / + alias name=parse-mp3 + extension-id=org.apache.nutch.parse.mp3.MP3Parser / + alias name=parse-msexcel + extension-id=org.apache.nutch.parse.msexcel.MSExcelParser / + alias name=parse-mspowerpoint + extension-id=org.apache.nutch.parse.mspowerpoint.MSPowerPointParser / + alias name=parse-msword + extension-id=org.apache.nutch.parse.msword.MSWordParser / + alias name=parse-pdf + extension-id=org.apache.nutch.parse.pdf.PdfParser / + alias name=parse-rss + extension-id=org.apache.nutch.parse.rss.RSSParser / + alias name=parse-rtf + 
extension-id=org.apache.nutch.parse.rtf.RTFParseFactory / + alias name=parse-swf + extension-id=org.apache.nutch.parse.swf.SWFParser / + alias name=parse-text + extension-id=org.apache.nutch.parse.text.TextParser / + alias name=parse-zip + extension-id=org.apache.nutch.parse.zip.ZipParser / + /aliases + /parse-plugins
svn commit: r379419 - in /lucene/nutch/trunk: site/mailing_lists.html site/mailing_lists.pdf src/site/src/documentation/content/xdocs/mailing_lists.xml
Author: jerome Date: Tue Feb 21 03:10:42 2006 New Revision: 379419 URL: http://svn.apache.org/viewcvs?rev=379419&view=rev Log: NUTCH-214, Add a search mailing list archive link (Jake Vanderdray) Modified: lucene/nutch/trunk/site/mailing_lists.html lucene/nutch/trunk/site/mailing_lists.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/mailing_lists.xml Modified: lucene/nutch/trunk/site/mailing_lists.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/mailing_lists.html?rev=379419&r1=379418&r2=379419&view=diff == --- lucene/nutch/trunk/site/mailing_lists.html (original) +++ lucene/nutch/trunk/site/mailing_lists.html Tue Feb 21 03:10:42 2006 @@ -3,7 +3,7 @@ head META http-equiv=Content-Type content=text/html; charset=UTF-8 meta content=Apache Forrest name=Generator -meta name=Forrest-version content=0.6 +meta name=Forrest-version content=0.7 meta name=Forrest-skin-name content=pelt titleNutch Mailing Lists/title link type=text/css href=skin/basic.css rel=stylesheet @@ -16,49 +16,22 @@ body onload=init() script type=text/javascriptndeSetTextSize();/script div id=top -!--+ -|breadtrail -+-- div class=breadtrail a href=http://www.apache.org/;Apache/a gt; a href=http://lucene.apache.org/;Lucene/a gt; a href=http://lucene.apache.org/nutch/;Nutch/ascript src=skin/breadcrumbs.js language=JavaScript type=text/javascript/script /div -!--+ -|header -+-- div class=header -!--+ -|start group logo -+-- div class=grouplogo a href=http://lucene.apache.org/;img class=logoImage alt=Lucene src=http://lucene.apache.org/java/docs/images/lucene_green_150.gif; title=Apache Lucene/a /div -!--+ -|end group logo -+-- -!--+ -|start Project Logo -+-- div class=projectlogo a href=http://lucene.apache.org/nutch/;img class=logoImage alt=Nutch src=images/nutch-logo.gif title=Open Source Web Search Software/a /div -!--+ -|end Project Logo -+-- -!--+ -|start Search -+-- div class=searchbox form action=http://www.google.com/search; method=get class=roundtopsmall -input 
value=lucene.apache.org name=sitesearch type=hiddeninput onFocus=getBlank (this, 'Search the site with google:'); value=Search the site with google: size=25 name=q id=query type=textnbsp; -input name=Search value=Search type=submit +input value=lucene.apache.org name=sitesearch type=hiddeninput onFocus=getBlank (this, 'Search the site with google'); size=25 name=q id=query type=text value=Search the site with googlenbsp; +input attr=value name=Search value=Search type=submit /form /div -!--+ -|end search -+-- -!--+ -|start Tabs -+-- ul id=tabs li class=current a class=base-selected href=index.htmlMain/a @@ -67,110 +40,83 @@ a class=base-not-selected href=http://wiki.apache.org/nutch/;Wiki/a /li /ul -!--+ -|end Tabs -+-- /div /div div id=main div id=publishedStrip -!--+ -|start Subtabs -+-- div id=level2tabs/div -!--+ -|end Endtabs -+-- -script type=text/javascript language=JavaScript!-- - document.write(Published: + document.lastModified); - // --/script -/div -!--+ -|breadtrail -+-- +script type=text/javascript!-- +document.write(textLast Published:/text + document.lastModified); +// --/script +/div div class=breadtrail nbsp; /div -!--+ -|start Menu, mainarea -+-- -!--+ -|start Menu -+-- div id=menu div onclick=SwitchMenu('menu_1.1', 'skin/') id=menu_1.1Title class=menutitleProject/div div id=menu_1.1 class=menuitemgroup div class=menuitem -a title= href=index.htmlNews/a +a href=index.htmlNews/a /div div class=menuitem -a title= href=about.htmlAbout/a +a href=about.htmlAbout/a /div div class=menuitem -a title= href=credits.htmlCredits/a +a href=credits.htmlCredits/a /div div class=menuitem -a title= href=http://www.cafepress.com/nutch/;Buy Stuff/a +a href=http://www.cafepress.com/nutch/;Buy Stuff/a /div /div div onclick=SwitchMenu('menu_1.2', 'skin/') id=menu_1.2Title class=menutitleDocumentation/div div id=menu_1.2 class=menuitemgroup div class=menuitem -a title= href=http://wiki.apache.org/nutch/FAQ;FAQ/a +a href=http://wiki.apache.org/nutch/FAQ;FAQ/a /div div 
class=menuitem -a title= href=http://wiki.apache.org/nutch/;Wiki/a +a href=http://wiki.apache.org/nutch/;Wiki/a /div div class=menuitem -a title= href=tutorial.htmlTutorial/a +a href=tutorial.htmlTutorial/a /div div class=menuitem -a title= href=bot.htmlRobot /a +a href=bot.htmlRobot /a /div div class=menuitem -a title= href=i18n.htmli18n/a +a href=i18n.htmli18n/a /div div class=menuitem -a title= href=apidocs/index.htmlAPI Docs/a +a href=apidocs/index.htmlAPI Docs/a /div /div div onclick=SwitchMenu('menu_selected_1.3', 'skin/') id=menu_selected_1.3Title class=menutitle style=background-image:
[Nutch Wiki] Update of Website Update HOWTO by JakeVanderdray
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by JakeVanderdray: http://wiki.apache.org/nutch/Website_Update_HOWTO -- - - ''This page is prepared for Nutch committers. You need committer rights to modify Nutch Website.'' - == What you need == [http://forrest.apache.org Apache Forrest] - Publishing framework used for Nutch Website maintenance. @@ -14, +11 @@ 1. Run {{{forrest}}} to build current version of documentation. If the build was successful it means Forrest was correctly installed and generated site is in {{{src/site/build/site}}} directory. 1. Modify files in {{{src/site/src}}} (mainly in {{{src/site/src/documentation/content/xdocs}}}). Run {{{forrest}}} in {{{/src/site}}} and review the changes after build. + If you aren't a committer for this project, you now need to follow the instruction in HowToContribute to get your changes updated on the site. If you are a committer, it's time to deploy the site. + == How to deploy the site == 1. When you are finally happy with your changes copy files from {{{src/site/build/site}}} directory to {{{site}}} and commit them to SVN.
[Nutch Wiki] Update of Website Update HOWTO by JakeVanderdray
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by JakeVanderdray: http://wiki.apache.org/nutch/Website_Update_HOWTO -- 1. Run {{{forrest}}} to build current version of documentation. If the build was successful it means Forrest was correctly installed and generated site is in {{{src/site/build/site}}} directory. 1. Modify files in {{{src/site/src}}} (mainly in {{{src/site/src/documentation/content/xdocs}}}). Run {{{forrest}}} in {{{/src/site}}} and review the changes after build. - If you aren't a committer for this project, you now need to follow the instruction in HowToContribute to get your changes updated on the site. If you are a committer, it's time to deploy the site. + If you aren't a committer for this project, you now need to follow the instructions in HowToContribute to get your changes applied to the site. You'll specifically want to read the sections on Creating a patch and Proposing your work. If you are a committer, it's time to deploy the site. == How to deploy the site ==
svn commit: r379511 - in /lucene/nutch/trunk: docs/fr/help.html src/web/pages/fr/help.xml
Author: jerome Date: Tue Feb 21 08:05:07 2006 New Revision: 379511 URL: http://svn.apache.org/viewcvs?rev=379511view=rev Log: Add fr help page Added: lucene/nutch/trunk/docs/fr/help.html (with props) lucene/nutch/trunk/src/web/pages/fr/help.xml (with props) Added: lucene/nutch/trunk/docs/fr/help.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/fr/help.html?rev=379511view=auto == --- lucene/nutch/trunk/docs/fr/help.html (added) +++ lucene/nutch/trunk/docs/fr/help.html Tue Feb 21 08:05:07 2006 @@ -0,0 +1,161 @@ +!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN +html!--This page is automatically generated. Do not edit!-- +head +META http-equiv=Content-Type content=text/html; charset=UTF-8 +titleNutch: aide/title +style type=text/css +.menuTd {background-color: #F9F7F4; height: 25; onMouseOver: this.style.backgroundColor='#ECE5DC';} +.menuTdhover {background-color: #ECE5DC; height: 25; onMouseOver: this.style.backgroundColor='#ECE5DC';} +.menuEntry { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #00; text-decoration: none} +.body {background-color: #F9F7F4;} +.bodytext { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #00; text-decoration: none} +.title { font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: #FF9900; text-decoration: none} +.intro { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #FF9900; text-decoration: none} +.orangeTd {background-color: #FF9900} +ul {list-style-image: url(../img/reiter/ul.gif)} +h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} +h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} +.url {color: #996600;} +.highlight {font-weight: bold;} +.ellipsis {font-weight: bold;} +/style +link type=image/x-icon href=../img/favicon.ico rel=icon +link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function queryfocus() { document.search.query.focus(); 
} +// --/script +/head +body onLoad=queryfocus(); +!--This file is automatically generated. Do not edit!-- +table cellspacing=0 cellpadding=0 border=0 width=635 +tr +td rowspan=2 width=140 valign=bottoma href=./img border=0 src=../img/reiter/logo_nutch.gif/aimg height=1 width=140 src=../img/reiter/spacer_66.gif/td +/tr +tr +td align=right valign=bottom width=495 +table width=495 cellspacing=0 cellpadding=0 border=0 +tr +td width=400 background=../img/reiter/_bg_reiter.gifnbsp;/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_le1.gif/tdtd nowrap=nowrap valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa href=about.html class=bodytextA propos/a/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_ri.gif/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_le.gif/tdtd nowrap=nowrap valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa href=http://wiki.apache.org/nutch/FAQ; class=bodytextQuestions freacute;quentes/a/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_ri.gif/td +/tr +/table +/td +/tr +/table +table cellspacing=0 cellpadding=0 border=0 width=635 +tr valign=top +td width=140 +table cellspacing=0 cellpadding=0 width=100% +tr +td#160;/td +/tr +/table +/tdtd background=../img/reiter/_spacer_cc.gif width=20#160;/tdtd class=body width=475 +table cellspacing=0 cellpadding=0 border=0 width=475 +tr +td valign=bottom width=275 height=125 class=titleaide/tdtd valign=bottom width=200 height=125img src=../img/reiter/robots.gif/td +/tr +/table +br class=br +span class=bodytext + +/spanspan class=bodytext +h3Requecirc;tes/h3 +/spanspan class=bodytext +Pour effectuer une recherche avec Nutch, il suffit de saisir quelques mots. 
+/spanspan class=bodytext +ul + +liLes reacute;sultats contiendront uniquement les pages qui contiennent +span style=font-style: italic;tous/span les mots de la question./li + +liUtilisez les doubles chevrons autour des termes qui doivent ecirc;tre adjacents, +comme dans le cas d'une phrase. Par exemple span style=font-weight: bold; +Nouvelle Zeacute;lande/span./li + +liLa ponctuation entre les mots est ignoreacute;e. Ainsi, la recherche +span style=font-weight: bold;http://www.nutch.org//span est eacute;quivalente agrave; +span style=font-weight: bold;http www nutch org/span./li + +liLa recherche n'est pas sensible agrave; la casse. Ainsi, la recherche +span style=font-weight: bold;NuTcH/span est eacute;quivalente agrave; span style=font-weight: bold;nUtCh/span./li + +liVous pouvez exclure un mot des reacute;sultats en placcedil;ant un signe moins +devant. Ainsi, la recherche span style=font-weight: bold;football +-nfl/span retournera les pages qui parlent de football, mais qui ne contiennent +pas le mot nfl./li + +/ul