[Nutch Wiki] Update of GeoPosition by MatthiasJaekle

2006-02-17 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by MatthiasJaekle:
http://wiki.apache.org/nutch/GeoPosition

--
  
* This plugin is used for local searches at http://www.umkreisfinder.de/.
  
+ 
+ == Installation ==
+ 
+   * Copy the plugin to your crawler and Tomcat directories.
+ 
+   * Activate the plugin in nutch.conf
+ 
+   * Add the parameters to nutch-conf and the files you need for your country.
+ 
+   * fetch / parse / index
+ 
+   * You should then see north / east values in your index when you click on 
explain.
+ 
+   * Try to search
+ 
+ 
  == To do ==
  
* Checking the existing implementation.


svn commit: r378513 - /lucene/nutch/branches/branch-0.7/CHANGES.txt

2006-02-17 Thread pkosiorowski
Author: pkosiorowski
Date: Fri Feb 17 05:58:05 2006
New Revision: 378513

URL: http://svn.apache.org/viewcvs?rev=378513&view=rev
Log:
Fixed JUnit test failing due to changes in www.nutch.org.

Modified:
lucene/nutch/branches/branch-0.7/CHANGES.txt

Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=378513&r1=378512&r2=378513&view=diff
==
--- lucene/nutch/branches/branch-0.7/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.7/CHANGES.txt Fri Feb 17 05:58:05 2006
@@ -13,6 +13,10 @@
  
  5. NUTCH-45 - Log corrupt segments in SegmentMergeTool. (Otis Gospodnetic).
 
+ 6. Fixed TestFetcher JUnit test failing due to changes in www.nutch.org
+website.
+
+
 Release 0.7.1 - 2005-10-01
 
  1. Give focus to search query input.




svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src: java/org/apache/nutch/parse/rtf/RTFParseFactory.java java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java test/org/apache/n

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 15:22:55 2006
New Revision: 378653

URL: http://svn.apache.org/viewcvs?rev=378653&view=rev
Log:
Adapt parse-rtf to nutch APIs changes (metadata, parse, protocol, ...)

Modified:

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 Fri Feb 17 15:22:55 2006
@@ -13,38 +13,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.rtf;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
- .mine
- .mine
-import org.apache.nutch.util.MetadataNames;
-
-===
-import org.apache.nutch.util.NutchConf;
-===
-import org.apache.hadoop.conf.Configuration;
- .r374853
- .r373941
+// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Properties;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+// RTF Parser imports
+import com.etranslate.tm.processing.rtf.ParseException;
 import com.etranslate.tm.processing.rtf.RTFParser;
 
+
 /**
  * A parser for RTF documents
  * 
  * @author Andy Hedges
  */
-public class RTFParseFactory implements Parser, MetadataNames {
+public class RTFParseFactory implements Parser {
 
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 byte[] raw = content.getContent();
 Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
 RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -55,28 +59,31 @@
 
 try {
   rtfParser.parse();
-} catch (com.etranslate.tm.processing.rtf.ParseException e) {
-  throw new ParseException(Exception parsing RTF document, e);
+} catch (ParseException e) {
+return new ParseStatus(ParseStatus.FAILED,
+   ParseStatus.FAILED_EXCEPTION,
+   e.toString()).getEmptyParse(conf);
 }
 
-Properties metadata = new Properties();
-metadata.putAll(content.getMetadata());
-metadata.putAll(delegate.getMetaData());
-String title = metadata.getProperty(TITLE);
+Metadata metadata = new Metadata();
+metadata.setAll(delegate.getMetaData());
+String title = metadata.get(DublinCore.TITLE);
 
 if (title != null) {
-//(CM): Why remove the title metadata property here? Even 
-//though it's stored in the ParseData, it still might be useful
-//to have via this properties object?
-//metadata.remove(title);
+  metadata.remove(DublinCore.TITLE);
 } else {
   title = ;
 }
 
 String text = delegate.getText();
 
-return new ParseImpl(text, new ParseData(title, OutlinkExtractor
-.getOutlinks(text, this.conf), metadata));
+return new ParseImpl(text,
+ new ParseData(ParseStatus.STATUS_SUCCESS,
+   title,
+   OutlinkExtractor
+.  getOutlinks(text, this.conf),
+   content.getMetadata(),
+   metadata));
   }
 
   public void setConf(Configuration conf) {

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
 (original)

svn commit: r378655 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-commons-httpclient/ lib-http

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 15:28:39 2006
New Revision: 378655

URL: http://svn.apache.org/viewcvs?rev=378655&view=rev
Log:
Review plugins building and testing

Modified:
lucene/nutch/trunk/src/plugin/analysis-de/build.xml
lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
lucene/nutch/trunk/src/plugin/build-plugin.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/creativecommons/build.xml
lucene/nutch/trunk/src/plugin/index-basic/build.xml
lucene/nutch/trunk/src/plugin/index-more/build.xml
lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml
lucene/nutch/trunk/src/plugin/lib-http/build.xml
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/build.xml
lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml
lucene/nutch/trunk/src/plugin/lib-nekohtml/build.xml
lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
lucene/nutch/trunk/src/plugin/ontology/build.xml
lucene/nutch/trunk/src/plugin/parse-ext/build.xml
lucene/nutch/trunk/src/plugin/parse-html/build.xml
lucene/nutch/trunk/src/plugin/parse-js/build.xml
lucene/nutch/trunk/src/plugin/parse-mp3/build.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rtf/build.xml
lucene/nutch/trunk/src/plugin/parse-swf/build.xml
lucene/nutch/trunk/src/plugin/parse-text/build.xml
lucene/nutch/trunk/src/plugin/parse-zip/build.xml
lucene/nutch/trunk/src/plugin/protocol-file/build.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml
lucene/nutch/trunk/src/plugin/protocol-http/build.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/trunk/src/plugin/query-basic/build.xml
lucene/nutch/trunk/src/plugin/query-more/build.xml
lucene/nutch/trunk/src/plugin/query-site/build.xml
lucene/nutch/trunk/src/plugin/query-url/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml

Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=378655&r1=378654&r2=378655&view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Feb 17 15:28:39 2006
@@ -4,9 +4,16 @@
 
   import file=../build-plugin.xml/
 
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=jar inheritall=false dir=../lib-lucene-analyzers/
+  /target
+
+  !-- Add compilation dependencies to classpath --
   path id=plugin.deps
-fileset dir=../lib-lucene-analyzers/lib
-  include name=*.jar /
+fileset dir=${nutch.root}/build
+  include name=**/lib-lucene-analyzers/*.jar /
 /fileset
   /path
 

Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=378655&r1=378654&r2=378655&view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Feb 17 15:28:39 2006
@@ -4,9 +4,16 @@
 
   import file=../build-plugin.xml/
 
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=jar inheritall=false dir=../lib-lucene-analyzers/
+  /target
+
+  !-- Add compilation dependencies to classpath --
   path id=plugin.deps
-fileset dir=../lib-lucene-analyzers/lib
-  include name=*.jar /
+fileset dir=${nutch.root}/build
+  include name=**/lib-lucene-analyzers/*.jar /
 /fileset
   /path
 

Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=378655&r1=378654&r2=378655&view=diff
==
--- lucene/nutch/trunk/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Feb 17 15:28:39 2006
@@ -68,10 +68,22 @@
   !-- to be overridden by sub-projects -- 
   target name=init-plugin/
 
+  !--
+   ! Used to build plugin compilation dependencies
+   ! (to be overridden by plugins)
+   !--
+  target name=deps-jar/
+
+  !--
+   ! Used to deploy plugin runtime 

svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src: java/org/apache/nutch/parse/mp3/MP3Parser.java java/org/apache/nutch/parse/mp3/MetadataCollector.java test/org/apache/nutch/parse

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 16:23:35 2006
New Revision: 378667

URL: http://svn.apache.org/viewcvs?rev=378667&view=rev
Log:
Adapts parse-mp3 to nutch APIs changes (metadata, parse, protocol, ...)

Modified:

lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
 Fri Feb 17 16:23:35 2006
@@ -16,10 +16,14 @@
 
 package org.apache.nutch.parse.mp3;
 
+// JDK imports
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Iterator;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
+// Java ID3 Tag imports
 import org.farng.mp3.MP3File;
 import org.farng.mp3.TagException;
 import org.farng.mp3.id3.AbstractID3v2;
@@ -27,29 +31,35 @@
 import org.farng.mp3.id3.ID3v1;
 import org.farng.mp3.object.AbstractMP3Object;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.Iterator;
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
 
 /**
  * A parser for MP3 audio files
  * @author Andy Hedges
  */
-
 public class MP3Parser implements Parser {
 
   private MetadataCollector metadataCollector;
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
-Parse parse = null;
-metadataCollector.putAll(content.getMetadata());
+  public Parse getParse(Content content) {
 
+Parse parse = null;
 byte[] raw = content.getContent();
-
 File tmp = null;
+
 try {
   tmp = File.createTempFile(nutch, .mp3);
   FileOutputStream fos = new FileOutputStream(tmp);
@@ -58,25 +68,31 @@
   MP3File mp3 = new MP3File(tmp);
 
   if (mp3.hasID3v2Tag()) {
-parse = getID3v2Parse(mp3);
+parse = getID3v2Parse(mp3, content.getMetadata());
   } else if (mp3.hasID3v1Tag()) {
-parse = getID3v1Parse(mp3);
+parse = getID3v1Parse(mp3, content.getMetadata());
   } else {
-throw new ParseException(No textual content available);
+return new ParseStatus(ParseStatus.FAILED,
+   ParseStatus.FAILED_MISSING_CONTENT,
+   No textual content 
available).getEmptyParse(conf);
   }
-
-
 } catch (IOException e) {
-  throw new ParseException(Couldn't create temporary file, e);
+  return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ Couldn't create temporary file: + 
e).getEmptyParse(conf);
 } catch (TagException e) {
-  throw new ParseException(ID3 Tags could not be parsed, e);
+  return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ ID3 Tags could not be parsed: + 
e).getEmptyParse(conf);
 } finally{
   tmp.delete();
 }
 return parse;
   }
 
-  private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException {
+  private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta)
+  throws MalformedURLException {
+
 ID3v1 tag = mp3.getID3v1Tag();
 metadataCollector.notifyProperty(TALB-Text, tag.getAlbum());
 metadataCollector.notifyProperty(TPE1-Text, tag.getArtist());
@@ -84,13 +100,17 @@
 metadataCollector.notifyProperty(TCON-Text, ( + tag.getGenre() + ));
 metadataCollector.notifyProperty(TIT2-Text, tag.getTitle());
 metadataCollector.notifyProperty(TYER-Text, tag.getYear());
-ParseData parseData = new ParseData(metadataCollector.getTitle(),
-metadataCollector.getOutlinks(),
-metadataCollector.getData(), getConf());
+ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+