Author: ab
Date: Wed Nov 23 05:55:30 2005
New Revision: 348440
URL: http://svn.apache.org/viewcvs?rev=348440&view=rev
Log:
Refactor NutchBean to extract LinkDbReader and move it to where it belongs.
NutchBean will now use a facade that implements HitInlinks.
Add command-line interface for reading inlink information, and dumping
the link db.
Add command "readlinkdb" to bin/nutch.
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
(with props)
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
(with props)
Removed:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
Modified:
lucene/nutch/branches/mapred/bin/nutch
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
Modified: lucene/nutch/branches/mapred/bin/nutch
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch?rev=348440&r1=348439&r2=348440&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/bin/nutch (original)
+++ lucene/nutch/branches/mapred/bin/nutch Wed Nov 23 05:55:30 2005
@@ -30,6 +30,7 @@
echo "where COMMAND is one of:"
echo " crawl one-step crawler for intranets"
echo " readdb read / dump crawl db"
+ echo " readlinkdb read / dump link db"
echo " admin database administration, including creation"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch"
@@ -135,6 +136,8 @@
CLASS=org.apache.nutch.crawl.ParseSegment
elif [ "$COMMAND" = "readdb" ] ; then
CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbReader
elif [ "$COMMAND" = "updatedb" ] ; then
CLASS=org.apache.nutch.crawl.CrawlDb
elif [ "$COMMAND" = "invertlinks" ] ; then
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=348440&view=auto
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
(added)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
Wed Nov 23 05:55:30 2005
@@ -0,0 +1,111 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.io.File;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.mapred.lib.HashPartitioner;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
+import java.util.logging.Logger;
+
+/** . */
+public class LinkDbReader {
+ public static final Logger LOG =
LogFormatter.getLogger(LinkDbReader.class.getName());
+
+ private static final Partitioner PARTITIONER = new HashPartitioner();
+
+ private NutchFileSystem fs;
+ private File directory;
+ private MapFile.Reader[] readers;
+
+ public LinkDbReader(NutchFileSystem fs, File directory) {
+ this.fs = fs;
+ this.directory = directory;
+ }
+
+ public String[] getAnchors(UTF8 url) throws IOException {
+ Inlinks inlinks = getInlinks(url);
+ if (inlinks == null)
+ return null;
+ return inlinks.getAnchors();
+ }
+
+ public Inlinks getInlinks(UTF8 url) throws IOException {
+
+ synchronized (this) {
+ if (readers == null) {
+ readers = MapFileOutputFormat.getReaders
+ (fs, new File(directory, LinkDb.CURRENT_NAME));
+ }
+ }
+
+ return (Inlinks)MapFileOutputFormat.getEntry
+ (readers, PARTITIONER, url, new Inlinks());
+ }
+
+ public static void processDumpJob(String linkdb, String output, NutchConf
config) throws IOException {
+ LOG.info("LinkDb dump: starting");
+ LOG.info("LinkDb db: " + linkdb);
+ File outFolder = new File(output);
+
+ JobConf job = new JobConf(config);
+
+ job.addInputDir(new File(linkdb, LinkDb.CURRENT_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setInputKeyClass(UTF8.class);
+ job.setInputValueClass(Inlinks.class);
+
+ job.setOutputDir(outFolder);
+ job.setOutputFormat(TextOutputFormat.class);
+ job.setOutputKeyClass(UTF8.class);
+ job.setOutputValueClass(Inlinks.class);
+
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 2) {
+ System.err.println("LinkDbReader <linkdb> {-dump <out_dir> | -url
<url>)");
+ System.err.println("\t-dump <out_dir>\tdump whole link db to a text file
in <out_dir>");
+ System.err.println("\t-url <url>\tprint information about <url> to
System.out");
+ return;
+ }
+
+ if (args[1].equals("-dump")) {
+ LinkDbReader.processDumpJob(args[0], args[2], NutchConf.get());
+ } else if (args[1].equals("-url")) {
+ LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(), new
File(args[0]));
+ Inlinks links = dbr.getInlinks(new UTF8(args[2]));
+ if (links == null) {
+ System.out.println(" - no link information.");
+ } else {
+ for (int i = 0; i < links.size(); i++) {
+ System.out.println(links.get(i).toString());
+ }
+ }
+ } else {
+ System.err.println("Error: wrong argument " + args[1]);
+ return;
+ }
+ }
+}
Propchange:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?rev=348440&view=auto
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
(added)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
Wed Nov 23 05:55:30 2005
@@ -0,0 +1,32 @@
+/*
+ * Created on Nov 23, 2005
+ * Author: Andrzej Bialecki <[EMAIL PROTECTED]>
+ *
+ */
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
+import org.apache.nutch.fs.NutchFileSystem;
+import org.apache.nutch.io.UTF8;
+
+import java.io.File;
+
+/**
+ * {@link HitInlinks} implementation that delegates to a {@link LinkDbReader},
+ * using the value of each hit's "url" detail field as the lookup key.
+ */
+public class LinkDbInlinks implements HitInlinks {
+
+  /** Reader every lookup is forwarded to. */
+  private final LinkDbReader linkdb;
+
+  /**
+   * Creates the facade over the link db found under <code>dir</code>.
+   *
+   * @param fs file system holding the link db
+   * @param dir root directory of the link db
+   */
+  public LinkDbInlinks(NutchFileSystem fs, File dir) {
+    this.linkdb = new LinkDbReader(fs, dir);
+  }
+
+  /**
+   * Returns the anchors the link db holds for the hit's url.
+   *
+   * @param details hit whose "url" value is looked up
+   * @return whatever {@link LinkDbReader#getAnchors(UTF8)} returns for that url
+   * @throws IOException if the link db lookup fails
+   */
+  public String[] getAnchors(HitDetails details) throws IOException {
+    final UTF8 key = new UTF8(details.getValue("url"));
+    return linkdb.getAnchors(key);
+  }
+
+  /**
+   * Returns the inlinks the link db holds for the hit's url.
+   *
+   * @param details hit whose "url" value is looked up
+   * @return whatever {@link LinkDbReader#getInlinks(UTF8)} returns for that url
+   * @throws IOException if the link db lookup fails
+   */
+  public Inlinks getInlinks(HitDetails details) throws IOException {
+    final UTF8 key = new UTF8(details.getValue("url"));
+    return linkdb.getInlinks(key);
+  }
+}
Propchange:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java?rev=348440&r1=348439&r2=348440&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
Wed Nov 23 05:55:30 2005
@@ -26,6 +26,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
/**
* One stop shopping for search-related functionality.
@@ -127,7 +128,7 @@
this.content = segments;
LOG.info("opening linkdb in " + linkDb);
- this.linkDb = new LinkDbReader(fs, linkDb);
+ this.linkDb = new LinkDbInlinks(fs, linkDb);
}
private void init(DistributedSearch.Client client) {