Hi there. Following the instructions at http://wiki.apache.org/nutch/RunNutchInEclipse0.9,
I checked out Nutch and configured it within Eclipse. I then noticed some compilation errors, mainly caused by methods whose signatures had changed. I believe they are now fixed, and I am attaching a patch that fixes them.
Index: /home/data/software/java/nutch/nutch-svn/contrib/web2/src/main/java/org/apache/nutch/webapp/common/WebAppModule.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/contrib/web2/src/main/java/org/apache/nutch/webapp/common/WebAppModule.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/contrib/web2/src/main/java/org/apache/nutch/webapp/common/WebAppModule.java (working copy) @@ -158,8 +158,8 @@ Element pattern = (Element) mapping.getElementsByTagName("url-pattern") .item(0); - String servletName = servlet.getTextContent().trim(); - String urlPattern = pattern.getTextContent().trim(); + String servletName = servlet.getNodeValue().trim(); + String urlPattern = pattern.getNodeValue().trim(); servlets.put(urlPattern, servletName); urlPatterns.add(urlPattern); Index: /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (working copy) @@ -51,7 +51,7 @@ setArtist(value); if (name.indexOf("URL Link") > -1) { - links.add(new Outlink(value, "", this.conf)); + links.add(new Outlink(value, "")); } else if (name.indexOf("Text") > -1) { text += value + "\n"; } Index: /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java 
(working copy) @@ -24,26 +24,21 @@ import java.net.MalformedURLException; import java.util.Iterator; -// Java ID3 Tag imports -import org.farng.mp3.MP3File; -import org.farng.mp3.TagException; -import org.farng.mp3.id3.AbstractID3v2; -import org.farng.mp3.id3.AbstractID3v2Frame; -import org.farng.mp3.id3.ID3v1; -import org.farng.mp3.object.AbstractMP3Object; - -// Hadoop imports import org.apache.hadoop.conf.Configuration; - -// Nutch imports import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; +import org.farng.mp3.MP3File; +import org.farng.mp3.TagException; +import org.farng.mp3.id3.AbstractID3v2; +import org.farng.mp3.id3.AbstractID3v2Frame; +import org.farng.mp3.id3.ID3v1; +import org.farng.mp3.object.AbstractMP3Object; /** @@ -55,7 +50,7 @@ private MetadataCollector metadataCollector; private Configuration conf; - public Parse getParse(Content content) { + public ParseResult getParse(Content content) { Parse parse = null; byte[] raw = content.getContent(); @@ -73,22 +68,25 @@ } else if (mp3.hasID3v1Tag()) { parse = getID3v1Parse(mp3, content.getMetadata()); } else { - return new ParseStatus(ParseStatus.FAILED, + parse = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT, "No textual content available").getEmptyParse(conf); + return ParseResult.createParseResult(content.getUrl(), parse); } } catch (IOException e) { - return new ParseStatus(ParseStatus.FAILED, + parse = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_EXCEPTION, "Couldn't create temporary file:" + e).getEmptyParse(conf); + return ParseResult.createParseResult(content.getUrl(), parse); } catch (TagException e) { - return new 
ParseStatus(ParseStatus.FAILED, + parse = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_EXCEPTION, "ID3 Tags could not be parsed:" + e).getEmptyParse(conf); + return ParseResult.createParseResult(content.getUrl(), parse); } finally{ tmp.delete(); } - return parse; + return ParseResult.createParseResult(content.getUrl(), parse); } private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta) Index: /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (working copy) @@ -17,6 +17,8 @@ package org.apache.nutch.parse.mp3; +import junit.framework.TestCase; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@ -23,6 +25,7 @@ import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; @@ -30,8 +33,6 @@ import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.util.NutchConfiguration; -import junit.framework.TestCase; - /** * Unit tests for TestMP3Parser. (Adapted from John Xing msword unit tests). 
* @@ -65,6 +66,8 @@ Protocol protocol; Content content; Parse parse; + ParseResult parseResult; + Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + id3v2; @@ -71,7 +74,9 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parseResult = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parse = parseResult.get(content.getUrl()); + Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v2", metadata.get("COMM-Text")); assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text")); @@ -96,6 +101,7 @@ String urlString; Protocol protocol; Content content; + ParseResult parseResult; Parse parse; Configuration conf = NutchConfiguration.create(); @@ -103,7 +109,8 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parseResult = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parse = parseResult.get(content.getUrl()); Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v1", metadata.get("COMM-Text")); @@ -123,6 +130,7 @@ String urlString; Protocol protocol; Content content; + ParseResult parseResult; Parse parse; Configuration conf = NutchConfiguration.create(); @@ -130,7 +138,9 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parseResult = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); + parse = parseResult.get(content.getUrl()); + // Metadata metadata = 
parse.getData().getParseMeta(); if (parse.getData().getStatus().isSuccess()) { fail("Expected ParseException"); Index: /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (working copy) @@ -31,6 +31,7 @@ import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; @@ -49,7 +50,7 @@ private Configuration conf; - public Parse getParse(Content content) { + public ParseResult getParse(Content content) { byte[] raw = content.getContent(); Reader reader = new InputStreamReader(new ByteArrayInputStream(raw)); RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); @@ -57,6 +58,8 @@ rtfParser = RTFParser.createParser(reader); rtfParser.setNewLine("\n"); rtfParser.setDelegate(delegate); + + Parse parse = null; try { rtfParser.parse(); @@ -61,9 +64,10 @@ try { rtfParser.parse(); } catch (ParseException e) { - return new ParseStatus(ParseStatus.FAILED, + parse = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_EXCEPTION, e.toString()).getEmptyParse(conf); + return ParseResult.createParseResult(content.getUrl(), parse); } Metadata metadata = new Metadata(); @@ -78,7 +82,7 @@ String text = delegate.getText(); - return new ParseImpl(text, + parse = new ParseImpl(text, new ParseData(ParseStatus.STATUS_SUCCESS, title, OutlinkExtractor @@ -85,6 +89,7 @@ . 
getOutlinks(text, this.conf), content.getMetadata(), metadata)); + return ParseResult.createParseResult(content.getUrl(), parse); } public void setConf(Configuration conf) { Index: /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java =================================================================== --- /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (revision 638548) +++ /home/data/software/java/nutch/nutch-svn/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (working copy) @@ -25,6 +25,7 @@ import org.apache.nutch.metadata.DublinCore; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; import org.apache.nutch.protocol.Content; @@ -69,6 +70,7 @@ Protocol protocol; Content content; Parse parse; + ParseResult parseResult; Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + rtfFile; @@ -75,7 +77,10 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content); + + parseResult = new ParseUtil(conf).parseByExtensionId("parse-rtf", content); + parse = parseResult.get(content.getUrl()); + String text = parse.getText(); assertEquals("The quick brown fox jumps over the lazy dog", text.trim());