Author: ab
Date: Wed Nov 23 05:55:30 2005
New Revision: 348440

URL: http://svn.apache.org/viewcvs?rev=348440&view=rev
Log:
Refactor NutchBean to extract LinkDbReader and move it to where it belongs.
NutchBean will now use a facade that implements HitInlinks.

Add command-line interface for reading inlink information, and dumping
the link db.

Add command "readlinkdb" to bin/nutch.

Added:
    
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java  
 (with props)
    
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
   (with props)
Removed:
    
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
Modified:
    lucene/nutch/branches/mapred/bin/nutch
    
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: lucene/nutch/branches/mapred/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch?rev=348440&r1=348439&r2=348440&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/bin/nutch (original)
+++ lucene/nutch/branches/mapred/bin/nutch Wed Nov 23 05:55:30 2005
@@ -30,6 +30,7 @@
   echo "where COMMAND is one of:"
   echo "  crawl             one-step crawler for intranets"
   echo "  readdb            read / dump crawl db"
+  echo "  readlinkdb        read / dump link db"
   echo "  admin             database administration, including creation"
   echo "  inject            inject new urls into the database"
   echo "  generate          generate new segments to fetch"
@@ -135,6 +136,8 @@
   CLASS=org.apache.nutch.crawl.ParseSegment
 elif [ "$COMMAND" = "readdb" ] ; then
   CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDbReader
 elif [ "$COMMAND" = "updatedb" ] ; then
   CLASS=org.apache.nutch.crawl.CrawlDb
 elif [ "$COMMAND" = "invertlinks" ] ; then

Added: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=348440&view=auto
==============================================================================
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java 
(added)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java 
Wed Nov 23 05:55:30 2005
@@ -0,0 +1,111 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.io.File;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.mapred.lib.HashPartitioner;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
+import java.util.logging.Logger;
+
+/** . */
+public class LinkDbReader {
+  public static final Logger LOG = 
LogFormatter.getLogger(LinkDbReader.class.getName());
+
+  private static final Partitioner PARTITIONER = new HashPartitioner();
+
+  private NutchFileSystem fs;
+  private File directory;
+  private MapFile.Reader[] readers;
+
+  public LinkDbReader(NutchFileSystem fs, File directory) {
+    this.fs = fs;
+    this.directory = directory;
+  }
+
+  public String[] getAnchors(UTF8 url) throws IOException {
+    Inlinks inlinks = getInlinks(url);
+    if (inlinks == null)
+      return null;
+    return inlinks.getAnchors();
+  }
+
+  public Inlinks getInlinks(UTF8 url) throws IOException {
+
+    synchronized (this) {
+      if (readers == null) {
+        readers = MapFileOutputFormat.getReaders
+          (fs, new File(directory, LinkDb.CURRENT_NAME));
+      }
+    }
+    
+    return (Inlinks)MapFileOutputFormat.getEntry
+      (readers, PARTITIONER, url, new Inlinks());
+  }
+  
+  public static void processDumpJob(String linkdb, String output, NutchConf 
config) throws IOException {
+    LOG.info("LinkDb dump: starting");
+    LOG.info("LinkDb db: " + linkdb);
+    File outFolder = new File(output);
+
+    JobConf job = new JobConf(config);
+
+    job.addInputDir(new File(linkdb, LinkDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setInputKeyClass(UTF8.class);
+    job.setInputValueClass(Inlinks.class);
+
+    job.setOutputDir(outFolder);
+    job.setOutputFormat(TextOutputFormat.class);
+    job.setOutputKeyClass(UTF8.class);
+    job.setOutputValueClass(Inlinks.class);
+
+    JobClient.runJob(job);
+  }
+  
+  public static void main(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("LinkDbReader <linkdb> {-dump <out_dir> | -url 
<url>)");
+      System.err.println("\t-dump <out_dir>\tdump whole link db to a text file 
in <out_dir>");
+      System.err.println("\t-url <url>\tprint information about <url> to 
System.out");
+      return;
+    }
+    
+    if (args[1].equals("-dump")) {
+      LinkDbReader.processDumpJob(args[0], args[2], NutchConf.get());
+    } else if (args[1].equals("-url")) {
+      LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(), new 
File(args[0]));
+      Inlinks links = dbr.getInlinks(new UTF8(args[2]));
+      if (links == null) {
+        System.out.println(" - no link information.");
+      } else {
+        for (int i = 0; i < links.size(); i++) {
+          System.out.println(links.get(i).toString());
+        }
+      }
+    } else {
+      System.err.println("Error: wrong argument " + args[1]);
+      return;
+    }
+  }
+}

Propchange: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDbReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?rev=348440&view=auto
==============================================================================
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
 (added)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
 Wed Nov 23 05:55:30 2005
@@ -0,0 +1,32 @@
+/*
+ * Created on Nov 23, 2005
+ * Author: Andrzej Bialecki <[EMAIL PROTECTED]>
+ *
+ */
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
+import org.apache.nutch.fs.NutchFileSystem;
+import org.apache.nutch.io.UTF8;
+
+import java.io.File;
+
+public class LinkDbInlinks implements HitInlinks {
+  
+  private LinkDbReader linkdb = null;
+  
+  public LinkDbInlinks(NutchFileSystem fs, File dir) {
+    linkdb = new LinkDbReader(fs, dir);
+  }
+
+  public String[] getAnchors(HitDetails details) throws IOException {
+    return linkdb.getAnchors(new UTF8(details.getValue("url")));
+  }
+
+  public Inlinks getInlinks(HitDetails details) throws IOException {
+    return linkdb.getInlinks(new UTF8(details.getValue("url")));
+  }
+}

Propchange: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java?rev=348440&r1=348439&r2=348440&view=diff
==============================================================================
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java 
Wed Nov 23 05:55:30 2005
@@ -26,6 +26,7 @@
 import org.apache.nutch.parse.*;
 import org.apache.nutch.indexer.*;
 import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
 
 /** 
  * One stop shopping for search-related functionality.
@@ -127,7 +128,7 @@
     this.content = segments;
 
     LOG.info("opening linkdb in " + linkDb);
-    this.linkDb = new LinkDbReader(fs, linkDb);
+    this.linkDb = new LinkDbInlinks(fs, linkDb);
   }
 
   private void init(DistributedSearch.Client client) {


Reply via email to