Author: mattmann
Date: Fri May 15 18:03:09 2015
New Revision: 1679613

URL: http://svn.apache.org/r1679613
Log:
NUTCH-2011 Endpoint to support realtime JSON output from the fetcher: 
Contributed by Sujen Shah <[email protected]> this closes #24.

Added:
    nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java
    nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
    nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
    nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
    nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 15 18:03:09 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen 
Shah via mattmann)
+
 * NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
 
 * NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for 
deletions (snagel)

Added: nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java?rev=1679613&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java Fri May 15 
18:03:09 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Mutable holder for what the fetcher recorded about a single fetched page.
+ * FetcherThread fills it in through the setters and DbResource reads it back
+ * through the getters when building the REST response.
+ */
+public class FetchNode {
+  /** URL of the fetched page. */
+  private Text url;
+  /** Links found on the page; remains null until setOutlinks is called. */
+  private Outlink[] outlinks;
+  /** Numeric fetch status code. */
+  private int status;
+  /** Title of the parsed page. */
+  private String title;
+  /** Fetch timestamp; FetcherThread stores System.currentTimeMillis() here. */
+  private long fetchTime;
+
+  // ---- setters ----
+
+  public void setUrl(Text url) {
+    this.url = url;
+  }
+
+  /** Stores the given array reference as-is (no copy is made). */
+  public void setOutlinks(Outlink[] links) {
+    this.outlinks = links;
+  }
+
+  public void setStatus(int status) {
+    this.status = status;
+  }
+
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+  public void setFetchTime(long fetchTime) {
+    this.fetchTime = fetchTime;
+  }
+
+  // ---- getters ----
+
+  public Text getUrl() {
+    return this.url;
+  }
+
+  /** @return the stored outlink array itself, or null if none was set */
+  public Outlink[] getOutlinks() {
+    return this.outlinks;
+  }
+
+  public int getStatus() {
+    return this.status;
+  }
+
+  public String getTitle() {
+    return this.title;
+  }
+
+  public long getFetchTime() {
+    return this.fetchTime;
+  }
+}
\ No newline at end of file

Added: nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java?rev=1679613&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java Fri May 15 
18:03:09 2015
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+
+/**
+ * In-memory registry of fetched pages, shared through {@link #getInstance()}.
+ * Each stored FetchNode is keyed by a sequence number that starts at 1 and
+ * grows by one per call to {@link #put(String, FetchNode)}.
+ */
+public class FetchNodeDb {
+
+  // Created during class initialization, so concurrent getInstance() callers
+  // always see the same object (the former lazy null-check could create two).
+  private static final FetchNodeDb fetchNodeDbInstance = new FetchNodeDb();
+
+  private final Map<Integer, FetchNode> fetchNodeDbMap;
+  // Next key to assign. Only touched inside the synchronized put(), because
+  // put() is called from FetcherThread instances and a bare index++ is not atomic.
+  private int index;
+
+  /** Creates an empty registry whose first assigned key will be 1. */
+  public FetchNodeDb(){
+    fetchNodeDbMap = new ConcurrentHashMap<Integer, FetchNode>();
+    index = 1;
+  }
+
+  /**
+   * @return the shared registry instance
+   */
+  public static FetchNodeDb getInstance(){
+    return fetchNodeDbInstance;
+  }
+
+  /**
+   * Stores a node under the next sequence number.
+   *
+   * @param url accepted for API compatibility; it is not used as the key
+   * @param fetchNode node to store; must not be null (the backing
+   *        ConcurrentHashMap rejects null values with a NullPointerException)
+   */
+  public synchronized void put(String url, FetchNode fetchNode){
+    fetchNodeDbMap.put(index++, fetchNode);
+  }
+
+  /**
+   * @return the live backing map (not a copy), keyed by sequence number
+   */
+  public Map<Integer, FetchNode> getFetchNodeDb(){
+    return fetchNodeDbMap;
+  }
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Fri May 15 
18:03:09 2015
@@ -123,6 +123,9 @@ public class FetcherThread extends Threa
   private AtomicInteger pages;
 
   private AtomicLong bytes;
+  
+  //Used by the REST service
+  private FetchNode fetchNode;
 
   public FetcherThread(Configuration conf, AtomicInteger activeThreads, 
FetchItemQueues fetchQueues, 
       QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong 
lastRequestStart, Reporter reporter,
@@ -188,6 +191,9 @@ public class FetcherThread extends Threa
     try {
 
       while (true) {
+        // creating FetchNode for storing in FetchNodeDb
+        this.fetchNode = new FetchNode();
+        
         // check whether must be stopped
         if (isHalted()) {
           LOG.debug(getName() + " set to halted");
@@ -282,6 +288,11 @@ public class FetcherThread extends Threa
             ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
 
             String urlString = fit.url.toString();
+            
+            //used for FetchNode
+            fetchNode.setStatus(status.getCode());
+            fetchNode.setFetchTime(System.currentTimeMillis());
+            fetchNode.setUrl(fit.url);
 
             reporter.incrCounter("FetcherStatus", status.getName(), 1);
 
@@ -608,7 +619,12 @@ public class FetcherThread extends Threa
           } else {
             fromHost = null;
           }
-
+          
+          //used by fetchNode            
+          fetchNode.setOutlinks(links);
+          fetchNode.setTitle(parseData.getTitle());
+          FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), 
fetchNode);
+          
           int validCount = 0;
 
           // Process all outlinks, normalize, filter and deduplicate

Modified: nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java Fri May 15 
18:03:09 2015
@@ -24,6 +24,7 @@ import java.util.concurrent.BlockingQueu
 import java.util.concurrent.TimeUnit;
 
 import com.fasterxml.jackson.jaxrs.json.JacksonJaxbJsonProvider;
+
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.OptionBuilder;
@@ -36,6 +37,7 @@ import org.apache.cxf.jaxrs.JAXRSBinding
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.nutch.fetcher.FetchNodeDb;
 import org.apache.nutch.service.impl.ConfManagerImpl;
 import org.apache.nutch.service.impl.JobFactory;
 import org.apache.nutch.service.impl.JobManagerImpl;
@@ -69,6 +71,8 @@ public class NutchServer {
   private JobManager jobManager;
   private JAXRSServerFactoryBean sf; 
 
+  private static FetchNodeDb fetchNodeDb;
+  
   private static NutchServer server;
 
   static {
@@ -80,6 +84,7 @@ public class NutchServer {
     BlockingQueue<Runnable> runnables = 
Queues.newArrayBlockingQueue(JOB_CAPACITY);
     NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, 
JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
     jobManager = new JobManagerImpl(new JobFactory(), configManager, executor);
+    fetchNodeDb = FetchNodeDb.getInstance();
 
     sf = new JAXRSServerFactoryBean();
     BindingFactoryManager manager = 
sf.getBus().getExtension(BindingFactoryManager.class);
@@ -139,6 +144,10 @@ public class NutchServer {
     return jobManager;
   }
 
+  /**
+   * Returns the FetchNodeDb held in this class's static fetchNodeDb field.
+   *
+   * @return the current value of the static fetchNodeDb field
+   */
+  public FetchNodeDb getFetchNodeDb(){
+    return fetchNodeDb;
+  }
+  
   public boolean isRunning(){
     return running;
   }

Added: 
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java?rev=1679613&view=auto
==============================================================================
--- 
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
 (added)
+++ 
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
 Fri May 15 18:03:09 2015
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.response;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Response model for one fetched page: its URL, fetch status code, number of
+ * outlinks, and one child entry (target URL plus anchor text) per outlink.
+ */
+public class FetchNodeDbInfo {
+
+  private String url;
+  private int status;
+  private int numOfOutlinks;
+  private List<ChildNode> children = new ArrayList<ChildNode>();
+
+  public String getUrl() {
+    return url;
+  }
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  public int getStatus() {
+    return status;
+  }
+
+  public void setStatus(int status) {
+    this.status = status;
+  }
+
+  public int getNumOfOutlinks() {
+    return numOfOutlinks;
+  }
+
+  public void setNumOfOutlinks(int numOfOutlinks) {
+    this.numOfOutlinks = numOfOutlinks;
+  }
+
+  /**
+   * Appends one child entry per outlink to the children list. Entries that
+   * are already in the list are kept.
+   *
+   * @param links outlinks to convert; null is treated as "no outlinks" and
+   *        leaves the list unchanged
+   */
+  public void setChildNodes(Outlink[] links){
+    if(links == null){
+      return;
+    }
+    for(Outlink outlink: links){
+      children.add(new ChildNode(outlink.getToUrl(), outlink.getAnchor()));
+    }
+  }
+
+  /**
+   * One outlink of the page. Declared static because it never uses the
+   * enclosing FetchNodeDbInfo, so it should not carry a reference to it.
+   */
+  private static class ChildNode{
+    private String childUrl;
+    private String anchorText;
+
+    public ChildNode(String childUrl, String anchorText){
+      this.childUrl = childUrl;
+      this.anchorText = anchorText;
+    }
+
+    public String getAnchorText() {
+      return anchorText;
+    }
+    public void setAnchorText(String anchorText) {
+      this.anchorText = anchorText;
+    }
+    public String getChildUrl() {
+      return childUrl;
+    }
+    public void setChildUrl(String childUrl) {
+      this.childUrl = childUrl;
+    }
+  }
+
+  /** @return the live list of child entries (not a copy) */
+  public List<ChildNode> getChildren() {
+    return children;
+  }
+
+  /** Replaces the list of child entries with the given list. */
+  public void setChildren(List<ChildNode> children) {
+    this.children = children;
+  }
+
+}

Modified: 
nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java Fri 
May 15 18:03:09 2015
@@ -17,18 +17,26 @@
 package org.apache.nutch.service.resources;
 
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 
 import javax.ws.rs.Consumes;
+import javax.ws.rs.DefaultValue;
+import javax.ws.rs.GET;
 import javax.ws.rs.POST;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDbReader;
+import org.apache.nutch.fetcher.FetchNode;
+import org.apache.nutch.fetcher.FetchNodeDb;
 import org.apache.nutch.service.model.request.DbQuery;
+import org.apache.nutch.service.model.response.FetchNodeDbInfo;
 
 @Path(value = "/db")
 public class DbResource extends AbstractResource {
@@ -55,7 +63,31 @@ public class DbResource extends Abstract
     return null;
 
   }    
-
+  
+  @GET
+  @Path(value="/fetchdb")
+  public List<FetchNodeDbInfo> fetchDb(@DefaultValue("0")@QueryParam("to")int 
to, @DefaultValue("0")@QueryParam("from")int from){
+    List<FetchNodeDbInfo> listOfFetchedNodes = new 
ArrayList<FetchNodeDbInfo>();
+    Map<Integer, FetchNode> fetchNodedbMap = 
FetchNodeDb.getInstance().getFetchNodeDb();
+    
+    if(to ==0 || to>fetchNodedbMap.size()){
+      to = fetchNodedbMap.size();
+    }
+    for(int i=from;i<=to;i++){
+      if(!fetchNodedbMap.containsKey(i)){
+        continue;
+      }
+      FetchNode node = fetchNodedbMap.get(i);
+      FetchNodeDbInfo fdbInfo = new FetchNodeDbInfo();
+      fdbInfo.setUrl(node.getUrl().toString());
+      fdbInfo.setStatus(node.getStatus());
+      fdbInfo.setNumOfOutlinks(node.getOutlinks().length);
+      fdbInfo.setChildNodes(node.getOutlinks());
+      listOfFetchedNodes.add(fdbInfo);
+    }
+    
+    return listOfFetchedNodes;
+  }
   @SuppressWarnings("resource")
   private Response crawlDbStats(Configuration conf, Map<String, String> args, 
String crawlId){
     CrawlDbReader dbr = new CrawlDbReader();


Reply via email to