svn commit: r233492 - in /lucene/nutch/trunk: conf/ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/creativecommons/ src/plugin/index-basic/ src/plugin/index-more/ src/plugin/languageidentifier/ src/plugin/nutch-extensionpoints/ src/plugin/nutch-...

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 08:55:46 2005
New Revision: 233492

URL: http://svn.apache.org/viewcvs?rev=233492&view=rev
Log:
NUTCH-10, extension points defined only once (Stefan Groschupf)

Added:
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/build.xml   (with props)
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml   (with 
props)
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/java/
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml
lucene/nutch/trunk/src/plugin/index-basic/plugin.xml
lucene/nutch/trunk/src/plugin/index-more/plugin.xml
lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
lucene/nutch/trunk/src/plugin/ontology/plugin.xml
lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml
lucene/nutch/trunk/src/plugin/parse-html/plugin.xml
lucene/nutch/trunk/src/plugin/parse-js/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-text/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
lucene/nutch/trunk/src/plugin/query-basic/plugin.xml
lucene/nutch/trunk/src/plugin/query-more/plugin.xml
lucene/nutch/trunk/src/plugin/query-site/plugin.xml
lucene/nutch/trunk/src/plugin/query-url/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=233492&r1=233491&r2=233492&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Aug 19 08:55:46 2005
@@ -587,9 +587,10 @@
 
 property
   nameplugin.includes/name
-  
valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
+  
valuenutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)/value
   descriptionRegular expression naming plugin directory names to
-  include.  Any plugin not matching this expression is excluded.  By
+  include.  Any plugin not matching this expression is excluded.
+  In any case you need at least include the nutch-extensionpoints plugin. By
   default Nutch includes crawling just HTML and plain text via HTTP,
   and basic indexing and search plugins.
   /description

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=233492&r1=233491&r2=233492&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 19 08:55:46 2005
@@ -6,6 +6,7 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
+ ant dir=nutch-extensionpoints target=deploy/
  ant dir=protocol-file target=deploy/
  ant dir=protocol-ftp target=deploy/
  ant dir=protocol-http target=deploy/
@@ -54,6 +55,7 @@
   !-- Clean all of the plugins.  --
   !-- == --
   target name=clean
+ant dir=nutch-extensionpoints target=clean/
 ant dir=protocol-file target=clean/
 ant dir=protocol-ftp target=clean/
 ant dir=protocol-http target=clean/

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=233492&r1=233491&r2=233492&view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Fri Aug 19 
08:55:46 2005
@@ -5,10 +5,6 @@
version=0.9.0

provider-name=carrot2.sourceforge.net

 

-   extension-point

-  id=org.apache.nutch.clustering.OnlineClusterer

-  name=Nutch Online Search Results Clustering Plugin/

-

runtime

   library name=clustering-carrot2.jar

  export name=*/


Modified: 

svn commit: r233544 - /lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 12:26:14 2005
New Revision: 233544

URL: http://svn.apache.org/viewcvs?rev=233544&view=rev
Log:
Correction in LanguageIdentifier unit test

Modified:

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=233544&r1=233543&r2=233544&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
 Fri Aug 19 12:26:14 2005
@@ -223,7 +223,7 @@
 String testLine = null;
 while((testLine = testFile.readLine()) != null) {
 testLine = testLine.trim();
-if (testLine.length() > 64) {
+if (testLine.length() > 256) {
 lang = idfr.identify(testLine);
 assertEquals(tokens[1], lang);
 }




svn commit: r233559 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ plugi...

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 14:15:02 2005
New Revision: 233559

URL: http://svn.apache.org/viewcvs?rev=233559&view=rev
Log:
* Add utility to extract urls from plain text (Stephan Strittmatter)
* Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java   
(with props)

lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java   
(with props)
Modified:

lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java

lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java

lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559&view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,227 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
+ * / URLs from plain text using Regular Expressions.
+ * 
+ * @see <a
+ *  
href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ *  of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ *  </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+  .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see a
+   *  
href=http://www.truerwords.net/articles/ut/urlactivation.html;http://www.truerwords.net/articles/ut/urlactivation.html
+   *  /a
+   */
+  private static final String URL_PATTERN = 
+
([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@~=%-]*))?);
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text.
+   * 
+   * @param plainText  the plain text from wich URLs should be extracted.
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+return OutlinkExtractor.getOutlinks(plainText, "");
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text and adds anchor
+   * to the extracted <code>Outlink</code>s
+   * 
+   * @param plainText the plain text from wich URLs should be extracted.
+   * @param anchorthe anchor of the url
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+final List outlinks = new ArrayList();
+
+try {
+  final PatternCompiler cp = new Perl5Compiler();
+  final Pattern pattern = cp.compile(URL_PATTERN,
+  Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+  | Perl5Compiler.MULTILINE_MASK);
+  final PatternMatcher matcher = new Perl5Matcher();
+
+  final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+  MatchResult result;
+  String url;
+
+  //loop the matches
+  while (matcher.contains(input, pattern)) {
+result = matcher.getMatch();
+  

svn commit: r233569 - /lucene/nutch/branches/mapred/bin/nutch-daemon.sh

2005-08-19 Thread cutting
Author: cutting
Date: Fri Aug 19 15:54:04 2005
New Revision: 233569

URL: http://svn.apache.org/viewcvs?rev=233569&view=rev
Log:
Fix to sync whole tree.

Modified:
lucene/nutch/branches/mapred/bin/nutch-daemon.sh

Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemon.sh?rev=233569&r1=233568&r2=233569&view=diff
==
--- lucene/nutch/branches/mapred/bin/nutch-daemon.sh (original)
+++ lucene/nutch/branches/mapred/bin/nutch-daemon.sh Fri Aug 19 15:54:04 2005
@@ -57,7 +57,7 @@
 root=`dirname $this`/..
 if [ $NUTCH_MASTER !=  ]; then
   echo rsync from $NUTCH_MASTER
-  rsync -a --delete --exclude=.svn $NUTCH_MASTER/{build,bin,lib,conf} $root
+  rsync -a --delete --exclude=.svn $NUTCH_MASTER/ $root
 fi
 
 cd $root