Author: cutting
Date: Thu Sep 1 14:03:51 2005
New Revision: 265778
URL: http://svn.apache.org/viewcvs?rev=265778&view=rev
Log:
Fix anchor & inlink access.
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Thu Sep 1 14:03:51 2005
@@ -131,9 +131,6 @@
JobConf job = new JobConf(config);
- job.setInt("partition.url.by.host.seed", new Random().nextInt());
- job.setPartitionerClass(PartitionUrlByHost.class);
-
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(UTF8.class);
job.setInputValueClass(ParseData.class);
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
Thu Sep 1 14:03:51 2005
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.File;
+import java.util.Arrays;
import org.apache.nutch.fs.NutchFileSystem;
@@ -48,6 +49,31 @@
public void close(Reporter reporter) throws IOException { out.close();}
};
- }
+ }
+
+ /** Open the output generated by this format. */
+ public static MapFile.Reader[] getReaders(NutchFileSystem fs, File dir)
+ throws IOException {
+ File[] names = fs.listFiles(dir);
+
+ // sort names, so that hash partitioning works
+ Arrays.sort(names);
+
+ MapFile.Reader[] parts = new MapFile.Reader[names.length];
+ for (int i = 0; i < names.length; i++) {
+ parts[i] = new MapFile.Reader(fs, names[i].toString());
+ }
+ return parts;
+ }
+
+ /** Get an entry from output generated by this class. */
+ public static Writable getEntry(MapFile.Reader[] readers,
+ Partitioner partitioner,
+ WritableComparable key,
+ Writable value) throws IOException {
+ int part = partitioner.getPartition(key, value, readers.length);
+ return readers[part].get(key, value);
+ }
+
}
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
Thu Sep 1 14:03:51 2005
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.io.*;
import org.apache.nutch.ipc.RPC;
@@ -37,7 +38,7 @@
/** The distributed search protocol. */
public interface Protocol
- extends Searcher, HitDetailer, HitSummarizer, HitContent {
+ extends Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks {
/** The name of the segments searched by this node. */
String[] getSegmentNames();
@@ -71,7 +72,8 @@
/** The search client. */
public static class Client extends Thread
- implements Searcher, HitDetailer, HitSummarizer, HitContent, Runnable {
+ implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
+ Runnable {
private InetSocketAddress[] defaultAddresses;
private InetSocketAddress[] liveAddresses;
@@ -293,6 +295,10 @@
public String[] getAnchors(HitDetails hit) throws IOException {
return getRemote(hit).getAnchors(hit);
+ }
+
+ public Inlinks getInlinks(HitDetails hit) throws IOException {
+ return getRemote(hit).getInlinks(hit);
}
public long getFetchDate(HitDetails hit) throws IOException {
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
Thu Sep 1 14:03:51 2005
@@ -33,30 +33,36 @@
import org.apache.nutch.indexer.*;
import org.apache.nutch.mapred.*;
import org.apache.nutch.mapred.lib.*;
+import org.apache.nutch.crawl.*;
/** Implements [EMAIL PROTECTED] HitSummarizer} and [EMAIL PROTECTED]
HitContent} for a set of
* fetched segments. */
public class FetchedSegments implements HitSummarizer, HitContent {
private static class Segment {
+ private static final Partitioner PARTITIONER = new HashPartitioner();
+
private NutchFileSystem nfs;
private File segmentDir;
private MapFile.Reader[] content;
private MapFile.Reader[] parseText;
private MapFile.Reader[] parseData;
-
- private Partitioner partitioner = new HashPartitioner();
+ private MapFile.Reader[] crawl;
public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
this.nfs = nfs;
this.segmentDir = segmentDir;
}
- public FetcherOutput getFetcherOutput(UTF8 url) throws IOException {
- throw new UnsupportedOperationException();
+ public CrawlDatum getCrawlDatum(UTF8 url) throws IOException {
+ synchronized (this) {
+ if (crawl == null)
+ crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
+ }
+ return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
}
-
+
public byte[] getContent(UTF8 url) throws IOException {
synchronized (this) {
if (content == null)
@@ -82,23 +88,12 @@
}
private MapFile.Reader[] getReaders(String subDir) throws IOException {
- File[] names = nfs.listFiles(new File(segmentDir, subDir));
-
- // sort names, so that hash partitioning works
- Arrays.sort(names);
-
- MapFile.Reader[] parts = new MapFile.Reader[names.length];
- for (int i = 0; i < names.length; i++) {
- parts[i] = new MapFile.Reader(nfs, names[i].toString());
- }
- return parts;
+ return MapFileOutputFormat.getReaders(nfs, new File(segmentDir, subDir));
}
- // hash the url to figure out which part its in
private Writable getEntry(MapFile.Reader[] readers, UTF8 url,
Writable entry) throws IOException {
- int part = partitioner.getPartition(url, null, readers.length);
- return readers[part].get(url, entry);
+ return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
}
}
@@ -134,14 +129,9 @@
return getSegment(details).getParseData(getUrl(details));
}
- public String[] getAnchors(HitDetails details) throws IOException {
- return getSegment(details).getFetcherOutput(getUrl(details))
- .getFetchListEntry().getAnchors();
- }
-
public long getFetchDate(HitDetails details) throws IOException {
- return getSegment(details).getFetcherOutput(getUrl(details))
- .getFetchDate();
+ return getSegment(details).getCrawlDatum(getUrl(details))
+ .getFetchTime();
}
public ParseText getParseText(HitDetails details) throws IOException {
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java
Thu Sep 1 14:03:51 2005
@@ -32,10 +32,7 @@
/** Returns the ParseText of a hit document. */
ParseText getParseText(HitDetails details) throws IOException;
- /** Returns the anchors of a hit document. */
- String[] getAnchors(HitDetails details) throws IOException;
-
- /** Returns the anchors of a hit document. */
+ /** Returns the fetch date of a hit document. */
long getFetchDate(HitDetails details) throws IOException;
}
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java?rev=265778&view=auto
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java
(added)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java
Thu Sep 1 14:03:51 2005
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+
+import org.apache.nutch.crawl.Inlinks;
+
+/** Service that returns information about incoming links to a hit. */
+public interface HitInlinks {
+ /** Returns the anchors of a hit document. */
+ String[] getAnchors(HitDetails details) throws IOException;
+
+ /** Return the inlinks of a hit document. */
+ Inlinks getInlinks(HitDetails details) throws IOException;
+}
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java?rev=265778&view=auto
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
(added)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
Thu Sep 1 14:03:51 2005
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+import java.io.File;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.mapred.lib.HashPartitioner;
+import org.apache.nutch.crawl.*;
+
+import java.io.IOException;
+
+/** . */
+public class LinkDbReader implements HitInlinks {
+ private static final Partitioner PARTITIONER = new HashPartitioner();
+
+ private NutchFileSystem fs;
+ private File directory;
+ private MapFile.Reader[] readers;
+
+ public LinkDbReader(NutchFileSystem fs, File directory) {
+ this.fs = fs;
+ this.directory = directory;
+ }
+
+ public String[] getAnchors(HitDetails details) throws IOException {
+ Inlinks inlinks = getInlinks(details);
+ if (inlinks == null)
+ return null;
+ return inlinks.getAnchors();
+ }
+
+ public Inlinks getInlinks(HitDetails details) throws IOException {
+
+ synchronized (this) {
+ if (readers == null) {
+ readers = MapFileOutputFormat.getReaders
+ (fs, new File(directory, LinkDb.CURRENT_NAME));
+ }
+ }
+
+ return (Inlinks)MapFileOutputFormat.getEntry
+ (readers, PARTITIONER,
+ new UTF8(details.getValue("url")),
+ new Inlinks());
+ }
+}
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
Thu Sep 1 14:03:51 2005
@@ -26,13 +26,14 @@
import org.apache.nutch.util.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
+import org.apache.nutch.crawl.Inlinks;
/**
* One stop shopping for search-related functionality.
* @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
*/
public class NutchBean
- implements Searcher, HitDetailer, HitSummarizer, HitContent,
+ implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
DistributedSearch.Protocol {
public static final Logger LOG =
@@ -50,6 +51,7 @@
private HitDetailer detailer;
private HitSummarizer summarizer;
private HitContent content;
+ private HitInlinks linkDb;
private float RAW_HITS_FACTOR =
NutchConf.get().getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
@@ -83,11 +85,13 @@
} else {
init(new File(dir, "index"),
new File(dir, "indexes"),
- new File(dir, "segments"));
+ new File(dir, "segments"),
+ new File(dir, "linkdb"));
}
}
- private void init(File indexDir, File indexesDir, File segmentsDir)
+ private void init(File indexDir, File indexesDir, File segmentsDir,
+ File linkDb)
throws IOException {
IndexSearcher indexSearcher;
if (fs.exists(indexDir)) {
@@ -117,11 +121,14 @@
FetchedSegments segments = new FetchedSegments(fs, segmentsDir.toString());
this.segmentNames = segments.getSegmentNames();
-
+
this.searcher = indexSearcher;
this.detailer = indexSearcher;
this.summarizer = segments;
this.content = segments;
+
+ LOG.info("opening linkdb in " + linkDb);
+ this.linkDb = new LinkDbReader(fs, linkDb);
}
private void init(DistributedSearch.Client client) throws IOException {
@@ -130,6 +137,7 @@
this.detailer = client;
this.summarizer = client;
this.content = client;
+ this.linkDb = client;
}
@@ -317,7 +325,11 @@
}
public String[] getAnchors(HitDetails hit) throws IOException {
- return content.getAnchors(hit);
+ return linkDb.getAnchors(hit);
+ }
+
+ public Inlinks getInlinks(HitDetails hit) throws IOException {
+ return linkDb.getInlinks(hit);
}
public long getFetchDate(HitDetails hit) throws IOException {
Modified: lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp?rev=265778&r1=265777&r2=265778&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp (original)
+++ lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp Thu Sep 1 14:03:51
2005
@@ -55,8 +55,10 @@
<ul>
<%
String[] anchors = bean.getAnchors(details);
- for (int i = 0; i < anchors.length; i++) {
+ if (anchors != null) {
+ for (int i = 0; i < anchors.length; i++) {
%><li><%=Entities.encode(anchors[i])%>
+<% } %>
<% } %>
</ul>