Author: cutting
Date: Tue Jun 7 14:51:28 2005
New Revision: 189453
URL: http://svn.apache.org/viewcvs?rev=189453&view=rev
Log:
First version of link db.
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlink.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlinks.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Modified:
lucene/nutch/branches/mapred/bin/nutch
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
Modified: lucene/nutch/branches/mapred/bin/nutch
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch?rev=189453&r1=189452&r2=189453&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/bin/nutch (original)
+++ lucene/nutch/branches/mapred/bin/nutch Tue Jun 7 14:51:28 2005
@@ -35,6 +35,7 @@
echo " fetch fetch a segment's pages"
echo " parse parse a segment's pages"
echo " updatedb update crawl db from segments after fetching"
+ echo " invertlinks create a linkdb from parsed segments"
echo " index run the indexer on a segment's fetcher output"
echo " merge merge several segment indexes"
echo " dedup remove duplicates from a set of segment indexes"
@@ -138,6 +139,8 @@
CLASS=org.apache.nutch.crawl.ParseSegment
elif [ "$COMMAND" = "updatedb" ] ; then
CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDb
elif [ "$COMMAND" = "index" ] ; then
CLASS=org.apache.nutch.indexer.IndexSegment
elif [ "$COMMAND" = "merge" ] ; then
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=189453&r1=189452&r2=189453&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Jun 7 14:51:28 2005
@@ -294,13 +294,6 @@
<description>If true, fetcher will log more verbosely.</description>
</property>
-<!-- parser properties -->
-<property>
- <name>parser.threads.parse</name>
- <value>10</value>
- <description>Number of ParserThreads ParseSegment should use.</description>
-</property>
-
<!-- i/o properties -->
<property>
Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlink.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlink.java?rev=189453&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlink.java
(added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlink.java
Tue Jun 7 14:51:28 2005
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import org.apache.nutch.io.*;
+
+/**
+ * An incoming link to a page, held as the url the link comes from plus the
+ * link's anchor. Serialized as two strings: the source url, then the anchor.
+ */
+public class Inlink implements Writable {
+
+  private String fromUrl;
+  private String anchor;
+
+  /**
+   * Creates an instance whose fields are unset (null) until
+   * {@link #readFields(DataInput)} is called.
+   */
+  public Inlink() {}
+
+  /**
+   * Creates an inlink.
+   * @param fromUrl the url the link comes from
+   * @param anchor the anchor of the link
+   */
+  public Inlink(String fromUrl, String anchor) {
+    this.fromUrl = fromUrl;
+    this.anchor = anchor;
+  }
+
+  /** Reads the source url and then the anchor, replacing the current values. */
+  public void readFields(DataInput in) throws IOException {
+    fromUrl = UTF8.readString(in);
+    anchor = UTF8.readString(in);
+  }
+
+  /** Skips over one Inlink in the input. */
+  public static void skip(DataInput in) throws IOException {
+    UTF8.skip(in);                                // skip fromUrl
+    UTF8.skip(in);                                // skip anchor
+  }
+
+  /** Writes the source url and then the anchor. */
+  public void write(DataOutput out) throws IOException {
+    UTF8.writeString(out, fromUrl);
+    UTF8.writeString(out, anchor);
+  }
+
+  /** Reads one Inlink from the input into a newly created instance. */
+  public static Inlink read(DataInput in) throws IOException {
+    Inlink inlink = new Inlink();
+    inlink.readFields(in);
+    return inlink;
+  }
+
+  /** Returns the url the link comes from. */
+  public String getFromUrl() { return fromUrl; }
+
+  /** Returns the anchor of the link. */
+  public String getAnchor() { return anchor; }
+
+  /**
+   * Two inlinks are equal when both their source urls and their anchors are
+   * equal. Unset (null) fields are only equal to other unset fields, so an
+   * instance made with the no-arg constructor can be compared safely.
+   */
+  public boolean equals(Object o) {
+    if (!(o instanceof Inlink))
+      return false;
+    Inlink other = (Inlink)o;
+    return
+      sameString(this.fromUrl, other.fromUrl) &&
+      sameString(this.anchor, other.anchor);
+  }
+
+  /** Combines the hashes of both fields; an unset (null) field hashes as 0. */
+  public int hashCode() {
+    return hashOf(fromUrl) ^ hashOf(anchor);
+  }
+
+  public String toString() {
+    return "fromUrl: " + fromUrl + " anchor: " + anchor;
+  }
+
+  /** Null-tolerant string equality. */
+  private static boolean sameString(String a, String b) {
+    return a == null ? b == null : a.equals(b);
+  }
+
+  /** Null-tolerant string hash. */
+  private static int hashOf(String s) {
+    return s == null ? 0 : s.hashCode();
+  }
+
+}
Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlinks.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlinks.java?rev=189453&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlinks.java
(added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Inlinks.java
Tue Jun 7 14:51:28 2005
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.nutch.io.*;
+
+/** A list of [EMAIL PROTECTED] Inlink}s. */
+public class Inlinks implements Writable {
+ private ArrayList inlinks = new ArrayList(1);
+
+ public void add(Inlink inlink) { inlinks.add(inlink); }
+
+ public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); }
+
+ public int size() { return inlinks.size(); }
+
+ public Inlink get(int i) { return (Inlink)inlinks.get(i); }
+
+ public void clear() { inlinks.clear(); }
+
+ public void readFields(DataInput in) throws IOException {
+ int length = in.readInt();
+ inlinks.clear();
+ inlinks.ensureCapacity(length);
+ for (int i = 0; i < length; i++) {
+ add(Inlink.read(in));
+ }
+ }
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(inlinks.size());
+ for (int i = 0; i < inlinks.size(); i++) {
+ ((Writable)inlinks.get(i)).write(out);
+ }
+ }
+
+}
Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=189453&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
(added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Tue Jun 7 14:51:28 2005
@@ -0,0 +1,146 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.util.*;
+import java.util.logging.*;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.parse.*;
+
+/**
+ * Maintains an inverted link map, listing incoming links for each url.
+ *
+ * <p>This class is both the mapper and the reducer of the inversion job: the
+ * map step turns each outlink of a parsed page into an inlink keyed by the
+ * outlink's target url, and the reduce step merges all inlinks collected for
+ * the same url.</p>
+ */
+public class LinkDb extends NutchConfigured implements Mapper, Reducer {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger("org.apache.nutch.crawl.LinkDb");
+
+  /** Name of the entry inside a link db directory that holds the installed data. */
+  public static String CURRENT_NAME = "current";
+
+
+  /** Constructs a LinkDb with no configuration. */
+  public LinkDb() {
+    super(null);
+  }
+
+  /** Construct a LinkDb. */
+  public LinkDb(NutchConf conf) {
+    super(conf);
+  }
+
+  /** Does nothing; no per-job setup is performed. */
+  public void configure(JobConf job) {}
+
+  /**
+   * Emits one record per outlink of a parsed page.
+   *
+   * @param key the record key; its string form is used as the source url
+   * @param value the {@link ParseData} whose outlinks are inverted
+   * @param output receives, for each outlink, the outlink's target url as key
+   *   and an {@link Inlinks} holding a single (source url, anchor) entry
+   */
+  public void map(WritableComparable key, Writable value,
+                  OutputCollector output) throws IOException {
+    String fromUrl = key.toString();
+    ParseData parseData = (ParseData)value;
+    Outlink[] outlinks = parseData.getOutlinks();
+    // One Inlinks instance is cleared and reused for every outlink.
+    // NOTE(review): this assumes collect() does not keep a reference to the
+    // value after it returns -- confirm against the OutputCollector in use.
+    Inlinks inlinks = new Inlinks();
+    for (int i = 0; i < outlinks.length; i++) {
+      Outlink outlink = outlinks[i];
+      inlinks.clear();
+      inlinks.add(new Inlink(fromUrl, outlink.getAnchor()));
+      output.collect(new UTF8(outlink.getToUrl()), inlinks);
+    }
+  }
+
+  /**
+   * Merges all {@link Inlinks} collected for one url into a single list and
+   * emits it under the same key.
+   *
+   * @param key the url the inlinks point to
+   * @param values iterator over the {@link Inlinks} collected for the key
+   * @param output receives the key together with the merged list
+   */
+  public void reduce(WritableComparable key, Iterator values,
+                     OutputCollector output) throws IOException {
+    // Accumulate into a fresh instance instead of adopting the first value
+    // handed out by the iterator: input values are then never mutated or
+    // retained, and the emitted value is never null.
+    Inlinks result = new Inlinks();
+    while (values.hasNext()) {
+      result.add((Inlinks)values.next());
+    }
+    output.collect(key, result);
+  }
+
+
+  /**
+   * Runs the inversion job over a directory of segments and installs the
+   * result as the current link db. The input directory is set to
+   * <code>segmentsDir</code> and "mapred.input.subdir" is set to
+   * {@link ParseData#DIR_NAME}.
+   *
+   * @param linkDb the link db directory
+   * @param segmentsDir directory containing the segments
+   */
+  public void invert(File linkDb, File segmentsDir) throws IOException {
+    JobConf job = LinkDb.createJob(getConf(), linkDb);
+    job.setInputDir(segmentsDir);
+    job.set("mapred.input.subdir", ParseData.DIR_NAME);
+    JobClient.runJob(job);
+    LinkDb.install(job, linkDb);
+  }
+
+  /**
+   * Runs the inversion job over an explicit list of segments and installs the
+   * result as the current link db. The {@link ParseData#DIR_NAME} entry of
+   * each segment is added as an input directory.
+   *
+   * @param linkDb the link db directory
+   * @param segments the segment directories to read
+   */
+  public void invert(File linkDb, File[] segments) throws IOException {
+    JobConf job = LinkDb.createJob(getConf(), linkDb);
+    for (int i = 0; i < segments.length; i++) {
+      job.addInputDir(new File(segments[i], ParseData.DIR_NAME));
+    }
+    JobClient.runJob(job);
+    LinkDb.install(job, linkDb);
+  }
+
+  /**
+   * Builds the job description for an inversion run. Output goes to a
+   * randomly named entry inside <code>linkDb</code>; {@link #install} later
+   * renames it to {@link #CURRENT_NAME}.
+   */
+  private static JobConf createJob(NutchConf config, File linkDb) {
+    // a single generator serves both the output name and the partition seed
+    Random random = new Random();
+
+    File newLinkDb =
+      new File(linkDb, Integer.toString(random.nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new JobConf(config);
+
+    job.setInt("partition.url.by.host.seed", random.nextInt());
+    job.setPartitionerClass(PartitionUrlByHost.class);
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setInputKeyClass(UTF8.class);
+    job.setInputValueClass(ParseData.class);
+
+    job.setMapperClass(LinkDb.class);
+    //job.setCombinerClass(LinkDb.class);
+    job.setReducerClass(LinkDb.class);
+
+    job.setOutputDir(newLinkDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setOutputKeyClass(UTF8.class);
+    job.setOutputValueClass(Inlinks.class);
+
+    return job;
+  }
+
+  /**
+   * Makes the output of a finished job the current link db. Any leftover
+   * "old" entry is deleted, the current entry is renamed to "old", the job's
+   * output directory is renamed to {@link #CURRENT_NAME}, and "old" is
+   * deleted again.
+   *
+   * @param job the finished job; its output directory is installed
+   * @param linkDb the link db directory
+   */
+  public static void install(JobConf job, File linkDb) throws IOException {
+    File newLinkDb = job.getOutputDir();
+    NutchFileSystem fs = new JobClient(job).getFs();
+    File old = new File(linkDb, "old");
+    File current = new File(linkDb, CURRENT_NAME);
+    fs.delete(old);
+    fs.rename(current, old);
+    fs.rename(newLinkDb, current);
+    fs.delete(old);
+  }
+
+  /**
+   * Command-line entry point. Expects the link db directory followed by the
+   * segments directory; prints a usage line and returns when fewer than two
+   * arguments are given.
+   */
+  public static void main(String[] args) throws Exception {
+    LinkDb linkDb = new LinkDb(NutchConf.get());
+
+    if (args.length < 2) {
+      System.err.println("Usage: <linkdb> <segments>");
+      return;
+    }
+
+    linkDb.invert(new File(args[0]), new File(args[1]));
+  }
+
+}
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java?rev=189453&r1=189452&r2=189453&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
Tue Jun 7 14:51:28 2005
@@ -38,18 +38,31 @@
protected File[] listFiles(NutchFileSystem fs, JobConf job)
throws IOException {
File[] dirs = job.getInputDirs();
- ArrayList files = new ArrayList();
+ String subdir = job.get("mapred.input.subdir");
+ ArrayList result = new ArrayList();
for (int i = 0; i < dirs.length; i++) {
File[] dir = fs.listFiles(dirs[i]);
if (dir != null) {
- files.addAll(Arrays.asList(dir));
+ for (int j = 0; j < dir.length; j++) {
+ File file = dir[j];
+ if (subdir != null) {
+ File[] subFiles = fs.listFiles(new File(file, subdir));
+ if (subFiles != null) {
+ for (int k = 0; k < subFiles.length; k++) {
+ result.add(subFiles[k]);
+ }
+ }
+ } else {
+ result.add(file);
+ }
+ }
}
}
- if (files.size() == 0) {
+ if (result.size() == 0) {
throw new IOException("No input files in: "+job.getInputDirs());
}
- return (File[])files.toArray(new File[files.size()]);
+ return (File[])result.toArray(new File[result.size()]);
}
/** Splits files returned by {#listFiles(NutchFileSystem,JobConf) when
@@ -59,7 +72,14 @@
File[] files = listFiles(fs, job);
- long totalSize = 0;
+ for (int i = 0; i < files.length; i++) { // check we have valid files
+ File file = files[i];
+ if (fs.isDirectory(file) || !fs.exists(file)) {
+ throw new IOException("Not a file: "+files[i]);
+ }
+ }
+
+ long totalSize = 0; // compute total size
for (int i = 0; i < files.length; i++) {
totalSize += fs.getLength(files[i]);
}
@@ -67,7 +87,7 @@
long bytesPerSplit = totalSize / numSplits;
long maxPerSplit = bytesPerSplit + (long)(bytesPerSplit*SPLIT_SLOP);
- ArrayList splits = new ArrayList(numSplits);
+ ArrayList splits = new ArrayList(numSplits); // generate splits
for (int i = 0; i < files.length; i++) {
File file = files[i];
long length = fs.getLength(file);
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java?rev=189453&r1=189452&r2=189453&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
Tue Jun 7 14:51:28 2005
@@ -22,6 +22,7 @@
import org.apache.nutch.fs.NutchFileSystem;
import org.apache.nutch.io.SequenceFile;
+import org.apache.nutch.io.MapFile;
import org.apache.nutch.io.Writable;
import org.apache.nutch.io.WritableComparable;
import org.apache.nutch.io.LongWritable;
@@ -37,7 +38,7 @@
for (int i = 0; i < files.length; i++) {
File file = files[i];
if (file.isDirectory()) { // it's a MapFile
- files[i] = new File(file, "data"); // use the data file
+ files[i] = new File(file, MapFile.DATA_FILE_NAME); // use the data file
}
}
return files;
-------------------------------------------------------
This SF.Net email is sponsored by: NEC IT Guy Games. How far can you shotput
a projector? How fast can you ride your desk chair down the office luge track?
If you want to score the big prize, get to know the little guy.
Play to win an NEC 61" plasma display: http://www.necitguy.com/?r
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs