Author: ab
Date: Tue Nov 28 12:14:58 2006
New Revision: 480188

URL: http://svn.apache.org/viewvc?view=rev&rev=480188
Log:
Move some constants to Nutch.java, so that Metadata could use them properly.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Nov 28 12:14:58 2006 @@ -33,6 +33,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; @@ -45,10 +46,6 @@ public static final Log LOG = LogFactory.getLog(Fetcher.class); - public static final String SIGNATURE_KEY = "nutch.content.digest"; - public static final String SEGMENT_NAME_KEY = "nutch.segment.name"; - public static final String SCORE_KEY = "nutch.crawl.score"; - public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. 
*/ public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits) @@ -268,7 +265,7 @@ } Metadata metadata = content.getMetadata(); // add segment to metadata - metadata.set(SEGMENT_NAME_KEY, segmentName); + metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); @@ -297,11 +294,11 @@ // Calculate page signature. For non-parsing fetchers this will // be done in ParseSegment byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); - metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature)); + metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); datum.setSignature(signature); // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName); - parse.getData().getContentMeta().set(SIGNATURE_KEY, StringUtil.toHexString(signature)); + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); try { scfilters.passScoreAfterParsing(key, content, parse); } catch (Exception e) { @@ -359,7 +356,7 @@ public void configure(JobConf job) { setConf(job); - this.segmentName = job.get(SEGMENT_NAME_KEY); + this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY); this.storingContent = isStoringContent(job); this.parsing = isParsing(job); @@ -430,7 +427,7 @@ job.setJobName("fetch " + segment); job.setInt("fetcher.threads.fetch", threads); - job.set(SEGMENT_NAME_KEY, segment.getName()); + job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Tue Nov 28 12:14:58 2006 @@ -47,6 +47,7 @@ import org.apache.lucene.index.*; import org.apache.lucene.document.*; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; /** Create indexes for segments. */ public class Indexer extends ToolBase implements Reducer { @@ -220,11 +221,11 @@ Metadata metadata = parseData.getContentMeta(); // add segment, used to map from merged index back to segment files - doc.add(new Field("segment", metadata.get(Fetcher.SEGMENT_NAME_KEY), + doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), Field.Store.YES, Field.Index.NO)); // add digest, used by dedup - doc.add(new Field("digest", metadata.get(Fetcher.SIGNATURE_KEY), + doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), Field.Store.YES, Field.Index.NO)); // if (LOG.isInfoEnabled()) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Tue Nov 28 12:14:58 2006 @@ -30,5 +30,11 @@ public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion"; - + + public static final String SIGNATURE_KEY = "nutch.content.digest"; + + public static final String SEGMENT_NAME_KEY = "nutch.segment.name"; + + public static final String SCORE_KEY = "nutch.crawl.score"; + } Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Nov 28 12:14:58 2006 @@ -29,6 +29,7 @@ import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.StringUtil; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import java.io.*; @@ -89,7 +90,7 @@ ParseData parseData = parse.getData(); // recover the signature prepared by Fetcher or ParseSegment - String sig = parseData.getContentMeta().get(Fetcher.SIGNATURE_KEY); + String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY); if (sig != null) { byte[] signature = StringUtil.fromHexString(sig); if (signature != null) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Nov 28 12:14:58 2006 @@ -25,6 +25,7 @@ import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; import org.apache.hadoop.conf.*; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; @@ -80,7 +81,7 @@ // compute the new signature byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(content, parse); - content.getMetadata().set(Fetcher.SIGNATURE_KEY, StringUtil.toHexString(signature)); + content.getMetadata().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); if (status.isSuccess()) { try { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=480188&r1=480187&r2=480188 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Nov 28 12:14:58 2006 @@ -35,6 +35,7 @@ import org.apache.nutch.fetcher.Fetcher; import org.apache.nutch.metadata.MetaWrapper; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.URLFilters; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; @@ -170,9 +171,9 @@ } else if (o instanceof ParseData) { // update the segment name inside contentMeta - required by Indexer if (slice == null) { - ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName); + ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); } else { - ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName + "-" + slice); + ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName + "-" + slice); } pd_out = ensureMapFile(slice, ParseData.DIR_NAME, ParseData.class); pd_out.append(key, o); Modified: lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=480188&r1=480187&r2=480188 
============================================================================== --- lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original) +++ lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Tue Nov 28 12:14:58 2006 @@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.Fetcher; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; @@ -102,18 +103,18 @@ /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { - content.getMetadata().set(Fetcher.SCORE_KEY, "" + datum.getScore()); + content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); } /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */ public void passScoreAfterParsing(Text url, Content content, Parse parse) { - parse.getData().getContentMeta().set(Fetcher.SCORE_KEY, content.getMetadata().get(Fetcher.SCORE_KEY)); + parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); } /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */ public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException { float score = scoreInjected; - String scoreString = parseData.getContentMeta().get(Fetcher.SCORE_KEY); + String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY); if (scoreString != null) { try { score = Float.parseFloat(scoreString);