Author: markus
Date: Tue Jul 2 08:36:13 2013
New Revision: 1498830
URL: http://svn.apache.org/r1498830
Log:
NUTCH-1327 QueryStringNormalizer
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498830&r1=1498829&r2=1498830&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 2 08:36:13 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1327 QueryStringNormalizer (markus)
+
* NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus)
* NUTCH-1580 index-static returns object instead of value for index.static
(Antoinette, lewismc, snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1498830&r1=1498829&r2=1498830&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Jul 2 08:36:13 2013
@@ -24,6 +24,7 @@ import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -41,6 +42,7 @@ import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
@@ -64,42 +66,41 @@ import org.apache.nutch.util.StringUtil;
/**
* Read utility for the CrawlDB.
- *
+ *
* @author Andrzej Bialecki
- *
+ *
*/
public class CrawlDbReader implements Closeable {
public static final Logger LOG =
LoggerFactory.getLogger(CrawlDbReader.class);
private MapFile.Reader[] readers = null;
-
+
private void openReaders(String crawlDb, Configuration config) throws IOException {
if (readers != null) return;
FileSystem fs = FileSystem.get(config);
readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
}
-
+
private void closeReaders() {
if (readers == null) return;
for (int i = 0; i < readers.length; i++) {
try {
readers[i].close();
} catch (Exception e) {
-
+
}
}
}
-
+
public static class CrawlDatumCsvOutputFormat extends
FileOutputFormat<Text,CrawlDatum> {
protected static class LineRecordWriter implements
RecordWriter<Text,CrawlDatum> {
private DataOutputStream out;
-
public LineRecordWriter(DataOutputStream out) {
this.out = out;
try {
- out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature\n");
+ out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature;Metadata\n");
} catch (IOException e) {}
}
@@ -129,6 +130,18 @@ public class CrawlDbReader implements Cl
out.writeByte('"');
out.writeBytes(value.getSignature() != null ? StringUtil.toHexString(value.getSignature()): "null");
out.writeByte('"');
+ out.writeByte(';');
+ out.writeByte('"');
+ if (value.getMetaData() != null) {
+ for (Entry<Writable, Writable> e : value.getMetaData().entrySet()) {
+ out.writeBytes(e.getKey().toString());
+ out.writeByte(':');
+ out.writeBytes(e.getValue().toString());
+ out.writeBytes("|||");
+ }
+ }
+ out.writeByte('"');
+
out.writeByte('\n');
}
@@ -165,10 +178,10 @@ public class CrawlDbReader implements Cl
}
}
}
-
+
public static class CrawlDbStatCombiner implements Reducer<Text,
LongWritable, Text, LongWritable> {
LongWritable val = new LongWritable();
-
+
public CrawlDbStatCombiner() { }
public void configure(JobConf job) { }
public void close() {}
@@ -249,7 +262,7 @@ public class CrawlDbReader implements Cl
public static class CrawlDbTopNMapper implements Mapper<Text, CrawlDatum,
FloatWritable, Text> {
private static final FloatWritable fw = new FloatWritable();
private float min = 0.0f;
-
+
public void configure(JobConf job) {
long lmin = job.getLong("db.reader.topn.min", 0);
if (lmin != 0) {
@@ -264,11 +277,11 @@ public class CrawlDbReader implements Cl
output.collect(fw, key); // invert mapping: score -> url
}
}
-
+
public static class CrawlDbTopNReducer implements Reducer<FloatWritable,
Text, FloatWritable, Text> {
private long topN;
private long count = 0L;
-
+
public void reduce(FloatWritable key, Iterator<Text> values,
OutputCollector<FloatWritable, Text> output, Reporter reporter) throws
IOException {
while (values.hasNext() && count < topN) {
key.set(-key.get());
@@ -280,20 +293,20 @@ public class CrawlDbReader implements Cl
public void configure(JobConf job) {
topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
-
+
public void close() {}
}
public void close() {
closeReaders();
}
-
+
public void processStatJob(String crawlDb, Configuration config, boolean
sort) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb statistics start: " + crawlDb);
}
-
+
Path tmpFolder = new Path(crawlDb, "stat_tmp" +
System.currentTimeMillis());
JobConf job = new NutchJob(config);
@@ -339,14 +352,14 @@ public class CrawlDbReader implements Cl
if (k.equals("scx")) {
if (val.get() < value.get()) val.set(value.get());
} else if (k.equals("scn")) {
- if (val.get() > value.get()) val.set(value.get());
+ if (val.get() > value.get()) val.set(value.get());
} else {
val.set(val.get() + value.get());
}
}
reader.close();
}
-
+
if (LOG.isInfoEnabled()) {
LOG.info("Statistics for CrawlDb: " + crawlDb);
LongWritable totalCnt = stats.get("T");
@@ -374,7 +387,7 @@ public class CrawlDbReader implements Cl
if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics: done"); }
}
-
+
public CrawlDatum get(String crawlDb, String url, Configuration config)
throws IOException {
Text key = new Text(url);
CrawlDatum val = new CrawlDatum();
@@ -462,12 +475,12 @@ public class CrawlDbReader implements Cl
}
public void processTopNJob(String crawlDb, long topN, float min, String
output, Configuration config) throws IOException {
-
+
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: " + crawlDb);
}
-
+
Path outFolder = new Path(output);
Path tempDir =
new Path(config.get("mapred.temp.dir", ".") +
@@ -488,8 +501,8 @@ public class CrawlDbReader implements Cl
// XXX hmmm, no setFloat() in the API ... :(
job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
- JobClient.runJob(job);
-
+ JobClient.runJob(job);
+
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: collecting topN scores.");
}