Author: jnioche
Date: Thu Aug 29 11:27:45 2013
New Revision: 1518594
URL: http://svn.apache.org/r1518594
Log:
(NUTCH-1622) Create Outlinks with metadata
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Aug 29 11:27:45 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1622 Create Outlinks with metadata (jnioche)
+
* NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via jnioche)
* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Aug 29 11:27:45 2013
@@ -17,65 +17,116 @@
package org.apache.nutch.parse;
-import java.io.*;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
import java.net.MalformedURLException;
+import java.util.Map.Entry;
-import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
/* An outgoing link from a page. */
public class Outlink implements Writable {
- private String toUrl;
- private String anchor;
-
- public Outlink() {}
-
- public Outlink(String toUrl, String anchor) throws MalformedURLException {
- this.toUrl = toUrl;
- if (anchor == null) anchor = "";
- this.anchor = anchor;
- }
-
- public void readFields(DataInput in) throws IOException {
- toUrl = Text.readString(in);
- anchor = Text.readString(in);
- }
-
- /** Skips over one Outlink in the input. */
- public static void skip(DataInput in) throws IOException {
- Text.skip(in); // skip toUrl
- Text.skip(in); // skip anchor
- }
-
- public void write(DataOutput out) throws IOException {
- Text.writeString(out, toUrl);
- Text.writeString(out, anchor);
- }
-
- public static Outlink read(DataInput in) throws IOException {
- Outlink outlink = new Outlink();
- outlink.readFields(in);
- return outlink;
- }
-
- public String getToUrl() { return toUrl; }
- public String getAnchor() { return anchor; }
-
- public void setUrl(String toUrl) {
- this.toUrl = toUrl;
- }
-
- public boolean equals(Object o) {
- if (!(o instanceof Outlink))
- return false;
- Outlink other = (Outlink)o;
- return
- this.toUrl.equals(other.toUrl) &&
- this.anchor.equals(other.anchor);
- }
-
- public String toString() {
- return "toUrl: " + toUrl + " anchor: " + anchor; // removed "\n". toString, not printLine... WD.
- }
+ private String toUrl;
+ private String anchor;
+ private MapWritable md;
+
+ public Outlink() {
+ }
+
+ public Outlink(String toUrl, String anchor) throws MalformedURLException {
+ this.toUrl = toUrl;
+ if (anchor == null)
+ anchor = "";
+ this.anchor = anchor;
+ md = null;
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ toUrl = Text.readString(in);
+ anchor = Text.readString(in);
+ boolean hasMD = in.readBoolean();
+ if (hasMD) {
+ md = new org.apache.hadoop.io.MapWritable();
+ md.readFields(in);
+ } else
+ md = null;
+ }
+
+ /** Skips over one Outlink in the input. */
+ public static void skip(DataInput in) throws IOException {
+ Text.skip(in); // skip toUrl
+ Text.skip(in); // skip anchor
+ boolean hasMD = in.readBoolean();
+ if (hasMD) {
+ MapWritable metadata = new org.apache.hadoop.io.MapWritable();
+ metadata.readFields(in);
+ ;
+ }
+ }
+
+ public void write(DataOutput out) throws IOException {
+ Text.writeString(out, toUrl);
+ Text.writeString(out, anchor);
+ if (md != null && md.size() > 0) {
+ out.writeBoolean(true);
+ md.write(out);
+ } else {
+ out.writeBoolean(false);
+ }
+ }
+
+ public static Outlink read(DataInput in) throws IOException {
+ Outlink outlink = new Outlink();
+ outlink.readFields(in);
+ return outlink;
+ }
+
+ public String getToUrl() {
+ return toUrl;
+ }
+
+ public void setUrl(String toUrl) {
+ this.toUrl = toUrl;
+ }
+
+ public String getAnchor() {
+ return anchor;
+ }
+
+ public MapWritable getMetadata() {
+ return md;
+ }
+
+ public void setMetadata(MapWritable md) {
+ this.md = md;
+ }
+
+ public boolean equals(Object o) {
+ if (!(o instanceof Outlink))
+ return false;
+ Outlink other = (Outlink) o;
+ return this.toUrl.equals(other.toUrl)
+ && this.anchor.equals(other.anchor);
+ }
+
+ public String toString() {
+ StringBuffer repr = new StringBuffer("toUrl: ");
+ repr.append(toUrl);
+ repr.append(" anchor: ");
+ repr.append(anchor);
+ if (md != null && !md.isEmpty()) {
+ for (Entry<Writable, Writable> e : md.entrySet()) {
+ repr.append(" ");
+ repr.append(e.getKey());
+ repr.append(": ");
+ repr.append(e.getValue());
+ }
+ }
+ return repr.toString();
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Aug 29 11:27:45 2013
@@ -233,6 +233,15 @@ public class ParseOutputFormat implement
CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED,
interval);
Text targetUrl = new Text(toUrl);
+
+ // see if the outlink has any metadata attached
+ // and if so pass that to the crawldatum so that
+ // the initial score or distribution can use that
+ MapWritable outlinkMD = links[i].getMetadata();
+ if (outlinkMD!=null){
+ target.getMetaData().putAll(outlinkMD);
+ }
+
try {
scfilters.initialScore(targetUrl, target);
} catch (ScoringFilterException e) {