svn commit: r233492 - in /lucene/nutch/trunk: conf/ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/creativecommons/ src/plugin/index-basic/ src/plugin/index-more/ src/plugin/languageidentifier/ src/plugin/nutch-extensionpoints/ src/plugin/nutch-...
Author: jerome Date: Fri Aug 19 08:55:46 2005 New Revision: 233492 URL: http://svn.apache.org/viewcvs?rev=233492&view=rev Log: NUTCH-10, extension points defined only once (Stefan Groschupf) Added: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/build.xml (with props) lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (with props) lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/java/ Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml lucene/nutch/trunk/src/plugin/index-basic/plugin.xml lucene/nutch/trunk/src/plugin/index-more/plugin.xml lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml lucene/nutch/trunk/src/plugin/ontology/plugin.xml lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml lucene/nutch/trunk/src/plugin/parse-html/plugin.xml lucene/nutch/trunk/src/plugin/parse-js/plugin.xml lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml lucene/nutch/trunk/src/plugin/parse-text/plugin.xml lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/query-basic/plugin.xml lucene/nutch/trunk/src/plugin/query-more/plugin.xml lucene/nutch/trunk/src/plugin/query-site/plugin.xml lucene/nutch/trunk/src/plugin/query-url/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Modified: 
lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Fri Aug 19 08:55:46 2005 @@ -587,9 +587,10 @@ property nameplugin.includes/name - valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value + valuenutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)/value descriptionRegular expression naming plugin directory names to - include. Any plugin not matching this expression is excluded. By + include. Any plugin not matching this expression is excluded. + In any case you need at least include the nutch-extensionpoints plugin. By default Nutch includes crawling just HTML and plain text via HTTP, and basic indexing and search plugins. /description Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 19 08:55:46 2005 @@ -6,6 +6,7 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy + ant dir=nutch-extensionpoints target=deploy/ ant dir=protocol-file target=deploy/ ant dir=protocol-ftp target=deploy/ ant dir=protocol-http target=deploy/ @@ -54,6 +55,7 @@ !-- Clean all of the plugins. 
-- !-- == -- target name=clean +ant dir=nutch-extensionpoints target=clean/ ant dir=protocol-file target=clean/ ant dir=protocol-ftp target=clean/ ant dir=protocol-http target=clean/ Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Fri Aug 19 08:55:46 2005 @@ -5,10 +5,6 @@ version=0.9.0 provider-name=carrot2.sourceforge.net - extension-point - id=org.apache.nutch.clustering.OnlineClusterer - name=Nutch Online Search Results Clustering Plugin/ - runtime library name=clustering-carrot2.jar export name=*/ Modified:
svn commit: r233544 - /lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
Author: jerome Date: Fri Aug 19 12:26:14 2005 New Revision: 233544 URL: http://svn.apache.org/viewcvs?rev=233544&view=rev Log: Correction in LanguageIdentifier unit test Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=233544&r1=233543&r2=233544&view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Fri Aug 19 12:26:14 2005 @@ -223,7 +223,7 @@ String testLine = null; while((testLine = testFile.readLine()) != null) { testLine = testLine.trim(); -if (testLine.length() > 64) { +if (testLine.length() > 256) { lang = idfr.identify(testLine); assertEquals(tokens[1], lang); }
svn commit: r233559 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ plugi...
Author: jerome Date: Fri Aug 19 14:15:02 2005 New Revision: 233559 URL: http://svn.apache.org/viewcvs?rev=233559view=rev Log: * Add utility to extract urls from plain text (Stephan Strittmatter) * Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (with props) lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (with props) Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Fri Aug 19 14:15:02 2005 @@ -0,0 +1,227 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.nutch.util.LogFormatter; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; + +/** + * Extractor to extract [EMAIL PROTECTED] org.apache.nutch.parse.Outlink}s + * / URLs from plain text using Regular Expressions. + * + * @see a + * href=http://wiki.java.net/bin/view/Javapedia/RegularExpressions;Comparison + * of different regexp-Implementations /a + * @see a href=http://regex.info/java.html;Overview about Java Regexp APIs + * /a + * + * @author Stephan Strittmatter - http://www.sybit.de + * @version 1.0 + * @since 0.7 + */ +public class OutlinkExtractor { + private static final Logger LOG = LogFormatter + .getLogger(OutlinkExtractor.class.getName()); + + /** + * Regex pattern to get URLs within a plain text. + * + * @see a + * href=http://www.truerwords.net/articles/ut/urlactivation.html;http://www.truerwords.net/articles/ut/urlactivation.html + * /a + */ + private static final String URL_PATTERN = + ([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@~=%-]*))?); + + /** + * Extracts codeOutlink/code from given plain text. + * + * @param plainText the plain text from wich URLs should be extracted. 
+ * + * @return Array of codeOutlink/codes within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText) { +return OutlinkExtractor.getOutlinks(plainText, ); + } + + /** + * Extracts codeOutlink/code from given plain text and adds anchor + * to the extracted codeOutlink/codes + * + * @param plainText the plain text from wich URLs should be extracted. + * @param anchorthe anchor of the url + * + * @return Array of codeOutlink/codes within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText, String anchor) { + +final List outlinks = new ArrayList(); + +try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(URL_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + //loop the matches + while (matcher.contains(input, pattern)) { +result = matcher.getMatch(); +
svn commit: r233569 - /lucene/nutch/branches/mapred/bin/nutch-daemon.sh
Author: cutting Date: Fri Aug 19 15:54:04 2005 New Revision: 233569 URL: http://svn.apache.org/viewcvs?rev=233569&view=rev Log: Fix to sync whole tree. Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemon.sh?rev=233569&r1=233568&r2=233569&view=diff == --- lucene/nutch/branches/mapred/bin/nutch-daemon.sh (original) +++ lucene/nutch/branches/mapred/bin/nutch-daemon.sh Fri Aug 19 15:54:04 2005 @@ -57,7 +57,7 @@ root=`dirname $this`/.. if [ "$NUTCH_MASTER" != "" ]; then echo rsync from $NUTCH_MASTER - rsync -a --delete --exclude=.svn $NUTCH_MASTER/{build,bin,lib,conf} $root + rsync -a --delete --exclude=.svn $NUTCH_MASTER/ $root fi cd $root