[Nutch Wiki] Update of GeoPosition by MatthiasJaekle
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by MatthiasJaekle: http://wiki.apache.org/nutch/GeoPosition -- * This plugin is used for local searches at http://www.umkreisfinder.de/. + + == Installation == + + * Copy the plugin to your crawler and tomcat dir. + + * Activate plugin in nutch.conf + + * Add the parameters to nutch-conf and the files you need for your country. + + * fetch / parse / index + + * Then you must see north / east values in your index if you click on explain. + + * Try to search + + == To do == * Checking the existing implementation.
svn commit: r378513 - /lucene/nutch/branches/branch-0.7/CHANGES.txt
Author: pkosiorowski Date: Fri Feb 17 05:58:05 2006 New Revision: 378513 URL: http://svn.apache.org/viewcvs?rev=378513&view=rev Log: Fixed JUnit test failing due to changes in www.nutch.org. Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=378513&r1=378512&r2=378513&view=diff == --- lucene/nutch/branches/branch-0.7/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.7/CHANGES.txt Fri Feb 17 05:58:05 2006 @@ -13,6 +13,10 @@ 5. NUTCH-45 - Log corrupt segments in SegmentMergeTool. (Otis Gospodnetic). + 6. Fixed TestFetcher JUnit test failing due to changes in www.nutch.org +website. + + Release 0.7.1 - 2005-10-01 1. Give focus to search query input.
svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src: java/org/apache/nutch/parse/rtf/RTFParseFactory.java java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java test/org/apache/n
Author: jerome Date: Fri Feb 17 15:22:55 2006 New Revision: 378653 URL: http://svn.apache.org/viewcvs?rev=378653&view=rev Log: Adapt parse-rtf to nutch APIs changes (metadata, parse, protocol, ...) Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff == --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006 @@ -13,38 +13,42 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse.rtf; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; - .mine - .mine -import org.apache.nutch.util.MetadataNames; - -=== -import org.apache.nutch.util.NutchConf; -=== -import org.apache.hadoop.conf.Configuration; - .r374853 - .r373941 +// JDK imports import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.Properties; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + +// RTF Parser imports +import com.etranslate.tm.processing.rtf.ParseException; import com.etranslate.tm.processing.rtf.RTFParser; + /** * A parser for RTF documents * * @author Andy Hedges */ -public class RTFParseFactory implements Parser, MetadataNames { +public class RTFParseFactory implements Parser { private Configuration conf; - public Parse getParse(Content content) throws ParseException { + public Parse getParse(Content content) { byte[] raw = content.getContent(); Reader reader = new InputStreamReader(new ByteArrayInputStream(raw)); RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); @@ -55,28 +59,31 @@ try { rtfParser.parse(); -} catch (com.etranslate.tm.processing.rtf.ParseException e) { - throw new ParseException(Exception parsing RTF document, e); +} catch (ParseException e) { +return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + e.toString()).getEmptyParse(conf); } -Properties metadata = new Properties(); -metadata.putAll(content.getMetadata()); -metadata.putAll(delegate.getMetaData()); -String title = 
metadata.getProperty(TITLE); +Metadata metadata = new Metadata(); +metadata.setAll(delegate.getMetaData()); +String title = metadata.get(DublinCore.TITLE); if (title != null) { -//(CM): Why remove the title metadata property here? Even -//though it's stored in the ParseData, it still might be useful -//to have via this properties object? -//metadata.remove(title); + metadata.remove(DublinCore.TITLE); } else { title = ; } String text = delegate.getText(); -return new ParseImpl(text, new ParseData(title, OutlinkExtractor -.getOutlinks(text, this.conf), metadata)); +return new ParseImpl(text, + new ParseData(ParseStatus.STATUS_SUCCESS, + title, + OutlinkExtractor +. getOutlinks(text, this.conf), + content.getMetadata(), + metadata)); } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653r1=378652r2=378653view=diff == --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original)
svn commit: r378655 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-commons-httpclient/ lib-http
Author: jerome Date: Fri Feb 17 15:28:39 2006 New Revision: 378655 URL: http://svn.apache.org/viewcvs?rev=378655view=rev Log: Review plugins building and testing Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml lucene/nutch/trunk/src/plugin/analysis-fr/build.xml lucene/nutch/trunk/src/plugin/build-plugin.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/creativecommons/build.xml lucene/nutch/trunk/src/plugin/index-basic/build.xml lucene/nutch/trunk/src/plugin/index-more/build.xml lucene/nutch/trunk/src/plugin/languageidentifier/build.xml lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml lucene/nutch/trunk/src/plugin/lib-http/build.xml lucene/nutch/trunk/src/plugin/lib-jakarta-poi/build.xml lucene/nutch/trunk/src/plugin/lib-log4j/build.xml lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml lucene/nutch/trunk/src/plugin/lib-nekohtml/build.xml lucene/nutch/trunk/src/plugin/lib-parsems/build.xml lucene/nutch/trunk/src/plugin/ontology/build.xml lucene/nutch/trunk/src/plugin/parse-ext/build.xml lucene/nutch/trunk/src/plugin/parse-html/build.xml lucene/nutch/trunk/src/plugin/parse-js/build.xml lucene/nutch/trunk/src/plugin/parse-mp3/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rtf/build.xml lucene/nutch/trunk/src/plugin/parse-swf/build.xml lucene/nutch/trunk/src/plugin/parse-text/build.xml lucene/nutch/trunk/src/plugin/parse-zip/build.xml lucene/nutch/trunk/src/plugin/protocol-file/build.xml lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml lucene/nutch/trunk/src/plugin/protocol-http/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml 
lucene/nutch/trunk/src/plugin/query-basic/build.xml lucene/nutch/trunk/src/plugin/query-more/build.xml lucene/nutch/trunk/src/plugin/query-site/build.xml lucene/nutch/trunk/src/plugin/query-url/build.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=378655r1=378654r2=378655view=diff == --- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Feb 17 15:28:39 2006 @@ -4,9 +4,16 @@ import file=../build-plugin.xml/ + !-- Build compilation dependencies -- + target name=deps-jar +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=jar inheritall=false dir=../lib-lucene-analyzers/ + /target + + !-- Add compilation dependencies to classpath -- path id=plugin.deps -fileset dir=../lib-lucene-analyzers/lib - include name=*.jar / +fileset dir=${nutch.root}/build + include name=**/lib-lucene-analyzers/*.jar / /fileset /path Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=378655r1=378654r2=378655view=diff == --- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Feb 17 15:28:39 2006 @@ -4,9 +4,16 @@ import file=../build-plugin.xml/ + !-- Build compilation dependencies -- + target name=deps-jar +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=jar inheritall=false dir=../lib-lucene-analyzers/ + /target + + !-- Add compilation dependencies to classpath -- path id=plugin.deps -fileset dir=../lib-lucene-analyzers/lib - include name=*.jar / +fileset dir=${nutch.root}/build + include name=**/lib-lucene-analyzers/*.jar / /fileset /path Modified: 
lucene/nutch/trunk/src/plugin/build-plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=378655&r1=378654&r2=378655&view=diff == --- lucene/nutch/trunk/src/plugin/build-plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Feb 17 15:28:39 2006 @@ -68,10 +68,22 @@ <!-- to be overridden by sub-projects --> <target name="init-plugin"/> + <!-- + ! Used to build plugin compilation dependencies + ! (to be overridden by plugins) + !--> + <target name="deps-jar"/> + + <!-- + ! Used to deploy plugin runtime
svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src: java/org/apache/nutch/parse/mp3/MP3Parser.java java/org/apache/nutch/parse/mp3/MetadataCollector.java test/org/apache/nutch/parse
Author: jerome Date: Fri Feb 17 16:23:35 2006 New Revision: 378667 URL: http://svn.apache.org/viewcvs?rev=378667&view=rev Log: Adapts parse-mp3 to nutch APIs changes (metadata, parse, protocol, ...) Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff == --- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Fri Feb 17 16:23:35 2006 @@ -16,10 +16,14 @@ package org.apache.nutch.parse.mp3; +// JDK imports +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Iterator; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; +// Java ID3 Tag imports import org.farng.mp3.MP3File; import org.farng.mp3.TagException; import org.farng.mp3.id3.AbstractID3v2; @@ -27,29 +31,35 @@ import org.farng.mp3.id3.ID3v1; import org.farng.mp3.object.AbstractMP3Object; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.Iterator; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import 
org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + /** * A parser for MP3 audio files * @author Andy Hedges */ - public class MP3Parser implements Parser { private MetadataCollector metadataCollector; private Configuration conf; - public Parse getParse(Content content) throws ParseException { -Parse parse = null; -metadataCollector.putAll(content.getMetadata()); + public Parse getParse(Content content) { +Parse parse = null; byte[] raw = content.getContent(); - File tmp = null; + try { tmp = File.createTempFile(nutch, .mp3); FileOutputStream fos = new FileOutputStream(tmp); @@ -58,25 +68,31 @@ MP3File mp3 = new MP3File(tmp); if (mp3.hasID3v2Tag()) { -parse = getID3v2Parse(mp3); +parse = getID3v2Parse(mp3, content.getMetadata()); } else if (mp3.hasID3v1Tag()) { -parse = getID3v1Parse(mp3); +parse = getID3v1Parse(mp3, content.getMetadata()); } else { -throw new ParseException(No textual content available); +return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_MISSING_CONTENT, + No textual content available).getEmptyParse(conf); } - - } catch (IOException e) { - throw new ParseException(Couldn't create temporary file, e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + Couldn't create temporary file: + e).getEmptyParse(conf); } catch (TagException e) { - throw new ParseException(ID3 Tags could not be parsed, e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + ID3 Tags could not be parsed: + e).getEmptyParse(conf); } finally{ tmp.delete(); } return parse; } - private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException { + private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta) + throws MalformedURLException { + ID3v1 tag = mp3.getID3v1Tag(); metadataCollector.notifyProperty(TALB-Text, tag.getAlbum()); metadataCollector.notifyProperty(TPE1-Text, tag.getArtist()); @@ -84,13 
+100,17 @@ metadataCollector.notifyProperty(TCON-Text, ( + tag.getGenre() + )); metadataCollector.notifyProperty(TIT2-Text, tag.getTitle()); metadataCollector.notifyProperty(TYER-Text, tag.getYear()); -ParseData parseData = new ParseData(metadataCollector.getTitle(), -metadataCollector.getOutlinks(), -metadataCollector.getData(), getConf()); +ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, +