Author: markus Date: Tue Jul 2 08:36:13 2013 New Revision: 1498830 URL: http://svn.apache.org/r1498830 Log: NUTCH-1327 QueryStringNormalizer
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498830&r1=1498829&r2=1498830&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 2 08:36:13 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1327 QueryStringNormalizer (markus) + * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) * NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1498830&r1=1498829&r2=1498830&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Jul 2 08:36:13 2013 @@ -24,6 +24,7 @@ import java.net.URL; import java.util.Date; import java.util.Iterator; import java.util.Map; +import java.util.Map.Entry; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,6 +42,7 @@ import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; @@ -64,42 +66,41 @@ import org.apache.nutch.util.StringUtil; /** * Read utility for the CrawlDB. - * + * * @author Andrzej Bialecki - * + * */ public class CrawlDbReader implements Closeable { public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class); private MapFile.Reader[] readers = null; - + private void openReaders(String crawlDb, Configuration config) throws IOException { if (readers != null) return; FileSystem fs = FileSystem.get(config); readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config); } - + private void closeReaders() { if (readers == null) return; for (int i = 0; i < readers.length; i++) { try { readers[i].close(); } catch (Exception e) { - + } } } - + public static class CrawlDatumCsvOutputFormat extends FileOutputFormat<Text,CrawlDatum> { protected static class LineRecordWriter implements RecordWriter<Text,CrawlDatum> { private DataOutputStream out; - public LineRecordWriter(DataOutputStream out) { this.out = out; try { - out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature\n"); + out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature;Metadata\n"); } catch (IOException e) {} } @@ -129,6 +130,18 @@ public class CrawlDbReader implements Cl out.writeByte('"'); out.writeBytes(value.getSignature() != null ? StringUtil.toHexString(value.getSignature()): "null"); out.writeByte('"'); + out.writeByte(';'); + out.writeByte('"'); + if (value.getMetaData() != null) { + for (Entry<Writable, Writable> e : value.getMetaData().entrySet()) { + out.writeBytes(e.getKey().toString()); + out.writeByte(':'); + out.writeBytes(e.getValue().toString()); + out.writeBytes("|||"); + } + } + out.writeByte('"'); + out.writeByte('\n'); } @@ -165,10 +178,10 @@ public class CrawlDbReader implements Cl } } } - + public static class CrawlDbStatCombiner implements Reducer<Text, LongWritable, Text, LongWritable> { LongWritable val = new LongWritable(); - + public CrawlDbStatCombiner() { } public void configure(JobConf job) { } public void close() {} @@ -249,7 +262,7 @@ public class CrawlDbReader implements Cl public static class CrawlDbTopNMapper implements Mapper<Text, CrawlDatum, FloatWritable, Text> { private static final FloatWritable fw = new FloatWritable(); private float min = 0.0f; - + public void configure(JobConf job) { long lmin = job.getLong("db.reader.topn.min", 0); if (lmin != 0) { @@ -264,11 +277,11 @@ public class CrawlDbReader implements Cl output.collect(fw, key); // invert mapping: score -> url } } - + public static class CrawlDbTopNReducer implements Reducer<FloatWritable, Text, FloatWritable, Text> { private long topN; private long count = 0L; - + public void reduce(FloatWritable key, Iterator<Text> values, OutputCollector<FloatWritable, Text> output, Reporter reporter) throws IOException { while (values.hasNext() && count < topN) { key.set(-key.get()); @@ -280,20 +293,20 @@ public class CrawlDbReader implements Cl public void configure(JobConf job) { topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks(); } - + public void close() {} } public void close() { closeReaders(); } - + public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics start: " + crawlDb); } - + Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis()); JobConf job = new NutchJob(config); @@ -339,14 +352,14 @@ public class CrawlDbReader implements Cl if (k.equals("scx")) { if (val.get() < value.get()) val.set(value.get()); } else if (k.equals("scn")) { - if (val.get() > value.get()) val.set(value.get()); + if (val.get() > value.get()) val.set(value.get()); } else { val.set(val.get() + value.get()); } } reader.close(); } - + if (LOG.isInfoEnabled()) { LOG.info("Statistics for CrawlDb: " + crawlDb); LongWritable totalCnt = stats.get("T"); @@ -374,7 +387,7 @@ public class CrawlDbReader implements Cl if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics: done"); } } - + public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException { Text key = new Text(url); CrawlDatum val = new CrawlDatum(); @@ -462,12 +475,12 @@ public class CrawlDbReader implements Cl } public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException { - + if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")"); LOG.info("CrawlDb db: " + crawlDb); } - + Path outFolder = new Path(output); Path tempDir = new Path(config.get("mapred.temp.dir", ".") + @@ -488,8 +501,8 @@ public class CrawlDbReader implements Cl // XXX hmmm, no setFloat() in the API ... :( job.setLong("db.reader.topn.min", Math.round(1000000.0 * min)); - JobClient.runJob(job); - + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: collecting topN scores."); }