Author: mattmann
Date: Fri May 15 18:03:09 2015
New Revision: 1679613
URL: http://svn.apache.org/r1679613
Log:
NUTCH-2011 Endpoint to support realtime JSON output from the fetcher:
Contributed by Sujen Shah <[email protected]> this closes #24.
Added:
nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java
nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 15 18:03:09 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen Shah via mattmann)
+
* NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
* NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for
deletions (snagel)
Added: nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java?rev=1679613&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNode.java Fri May 15
18:03:09 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Mutable holder for what the fetcher records about a single URL: the URL
+ * itself, its outlinks, a numeric status code, the page title and a fetch
+ * timestamp. All values are stored and returned as given; no copies are made.
+ */
+public class FetchNode {
+
+  // Fields rely on Java's default values (null / 0) until a setter is called.
+  private Text url;
+  private Outlink[] outlinks;
+  private int status;
+  private String title;
+  private long fetchTime;
+
+  /** @return the URL of this node, or {@code null} if none was set */
+  public Text getUrl() {
+    return this.url;
+  }
+
+  /** @param url the URL of this node */
+  public void setUrl(Text url) {
+    this.url = url;
+  }
+
+  /** @return the stored outlink array itself, or {@code null} if none was set */
+  public Outlink[] getOutlinks() {
+    return this.outlinks;
+  }
+
+  /** @param links the outlinks to store; the array is kept by reference */
+  public void setOutlinks(Outlink[] links) {
+    this.outlinks = links;
+  }
+
+  /** @return the numeric status code, {@code 0} if none was set */
+  public int getStatus() {
+    return this.status;
+  }
+
+  /** @param status the numeric status code */
+  public void setStatus(int status) {
+    this.status = status;
+  }
+
+  /** @return the page title, or {@code null} if none was set */
+  public String getTitle() {
+    return this.title;
+  }
+
+  /** @param title the page title */
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+  /** @return the fetch timestamp, {@code 0} if none was set */
+  public long getFetchTime() {
+    return this.fetchTime;
+  }
+
+  /** @param fetchTime the fetch timestamp */
+  public void setFetchTime(long fetchTime) {
+    this.fetchTime = fetchTime;
+  }
+}
\ No newline at end of file
Added: nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java?rev=1679613&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetchNodeDb.java Fri May 15
18:03:09 2015
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+
+/**
+ * In-memory store of {@link FetchNode}s keyed by a sequence number that
+ * starts at 1 and increases by one for every node stored.
+ *
+ * <p>A shared instance is available through {@link #getInstance()}.
+ * {@link #put(String, FetchNode)} may be called from several threads at once:
+ * the backing map is a {@link ConcurrentHashMap} and keys are handed out by
+ * an {@link AtomicInteger}, so no two calls can receive the same key.
+ */
+public class FetchNodeDb {
+
+  /**
+   * The shared instance. Created during class initialization, which the JVM
+   * runs at most once, so every caller of {@link #getInstance()} sees the
+   * same object even when the first calls race.
+   */
+  private static final FetchNodeDb INSTANCE = new FetchNodeDb();
+
+  private final Map<Integer, FetchNode> fetchNodeDbMap;
+
+  /** Next key to hand out; atomic so concurrent puts never share a key. */
+  private final AtomicInteger index;
+
+  /**
+   * Creates an empty store whose first key will be 1. Kept public for
+   * backward compatibility; use {@link #getInstance()} for the shared store.
+   */
+  public FetchNodeDb() {
+    fetchNodeDbMap = new ConcurrentHashMap<Integer, FetchNode>();
+    index = new AtomicInteger(1);
+  }
+
+  /**
+   * @return the shared store; always the same instance
+   */
+  public static FetchNodeDb getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Stores a node under the next free sequence number.
+   *
+   * @param url
+   *          currently unused; nodes are keyed by sequence number only
+   * @param fetchNode
+   *          the node to store, must not be {@code null}
+   * @throws NullPointerException
+   *           if {@code fetchNode} is {@code null}
+   */
+  public void put(String url, FetchNode fetchNode) {
+    // Reject null before consuming a key so a failed call leaves no gap.
+    if (fetchNode == null) {
+      throw new NullPointerException("fetchNode");
+    }
+    fetchNodeDbMap.put(index.getAndIncrement(), fetchNode);
+  }
+
+  /**
+   * @return the backing map itself (not a copy); it keeps changing as more
+   *         nodes are stored
+   */
+  public Map<Integer, FetchNode> getFetchNodeDb() {
+    return fetchNodeDbMap;
+  }
+}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Fri May 15
18:03:09 2015
@@ -123,6 +123,9 @@ public class FetcherThread extends Threa
private AtomicInteger pages;
private AtomicLong bytes;
+
+ //Used by the REST service
+ private FetchNode fetchNode;
public FetcherThread(Configuration conf, AtomicInteger activeThreads,
FetchItemQueues fetchQueues,
QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong
lastRequestStart, Reporter reporter,
@@ -188,6 +191,9 @@ public class FetcherThread extends Threa
try {
while (true) {
+ // creating FetchNode for storing in FetchNodeDb
+ this.fetchNode = new FetchNode();
+
// check whether must be stopped
if (isHalted()) {
LOG.debug(getName() + " set to halted");
@@ -282,6 +288,11 @@ public class FetcherThread extends Threa
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
String urlString = fit.url.toString();
+
+ //used for FetchNode
+ fetchNode.setStatus(status.getCode());
+ fetchNode.setFetchTime(System.currentTimeMillis());
+ fetchNode.setUrl(fit.url);
reporter.incrCounter("FetcherStatus", status.getName(), 1);
@@ -608,7 +619,12 @@ public class FetcherThread extends Threa
} else {
fromHost = null;
}
-
+
+ //used by fetchNode
+ fetchNode.setOutlinks(links);
+ fetchNode.setTitle(parseData.getTitle());
+ FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(),
fetchNode);
+
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
Modified: nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java Fri May 15
18:03:09 2015
@@ -24,6 +24,7 @@ import java.util.concurrent.BlockingQueu
import java.util.concurrent.TimeUnit;
import com.fasterxml.jackson.jaxrs.json.JacksonJaxbJsonProvider;
+
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
@@ -36,6 +37,7 @@ import org.apache.cxf.jaxrs.JAXRSBinding
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.nutch.fetcher.FetchNodeDb;
import org.apache.nutch.service.impl.ConfManagerImpl;
import org.apache.nutch.service.impl.JobFactory;
import org.apache.nutch.service.impl.JobManagerImpl;
@@ -69,6 +71,8 @@ public class NutchServer {
private JobManager jobManager;
private JAXRSServerFactoryBean sf;
+ private static FetchNodeDb fetchNodeDb;
+
private static NutchServer server;
static {
@@ -80,6 +84,7 @@ public class NutchServer {
BlockingQueue<Runnable> runnables =
Queues.newArrayBlockingQueue(JOB_CAPACITY);
NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10,
JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
jobManager = new JobManagerImpl(new JobFactory(), configManager, executor);
+ fetchNodeDb = FetchNodeDb.getInstance();
sf = new JAXRSServerFactoryBean();
BindingFactoryManager manager =
sf.getBus().getExtension(BindingFactoryManager.class);
@@ -139,6 +144,10 @@ public class NutchServer {
return jobManager;
}
+  /**
+   * Gives access to the fetch node store held by this server.
+   *
+   * @return the value of the static {@code fetchNodeDb} field
+   */
+  public FetchNodeDb getFetchNodeDb() {
+    return NutchServer.fetchNodeDb;
+  }
+
public boolean isRunning(){
return running;
}
Added:
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java?rev=1679613&view=auto
==============================================================================
---
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
(added)
+++
nutch/trunk/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
Fri May 15 18:03:09 2015
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.response;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Response model describing one fetched page: its URL, a numeric status
+ * code, the number of outlinks and the outlinks themselves as child nodes
+ * (target URL plus anchor text).
+ */
+public class FetchNodeDbInfo {
+
+  private String url;
+  private int status;
+  private int numOfOutlinks;
+  private List<ChildNode> children = new ArrayList<ChildNode>();
+
+  /** @return the page URL, or {@code null} if none was set */
+  public String getUrl() {
+    return url;
+  }
+
+  /** @param url the page URL */
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  /** @return the numeric status code */
+  public int getStatus() {
+    return status;
+  }
+
+  /** @param status the numeric status code */
+  public void setStatus(int status) {
+    this.status = status;
+  }
+
+  /** @return the outlink count as last set by the caller */
+  public int getNumOfOutlinks() {
+    return numOfOutlinks;
+  }
+
+  /**
+   * @param numOfOutlinks
+   *          the outlink count; stored as given, not derived from the
+   *          children list
+   */
+  public void setNumOfOutlinks(int numOfOutlinks) {
+    this.numOfOutlinks = numOfOutlinks;
+  }
+
+  /**
+   * Appends one child node per outlink to the current children; children
+   * added earlier are kept.
+   *
+   * @param links
+   *          the outlinks to convert; a {@code null} array adds nothing and
+   *          {@code null} elements are skipped
+   */
+  public void setChildNodes(Outlink[] links) {
+    if (links == null) {
+      return;
+    }
+    // setChildren(null) may have cleared the list; start a fresh one then.
+    if (children == null) {
+      children = new ArrayList<ChildNode>();
+    }
+    for (Outlink outlink : links) {
+      if (outlink == null) {
+        continue;
+      }
+      children.add(new ChildNode(outlink.getToUrl(), outlink.getAnchor()));
+    }
+  }
+
+  /**
+   * One outlink of the page. Declared static because it uses nothing from
+   * the enclosing instance and should not hold a reference to it.
+   */
+  private static class ChildNode {
+    private String childUrl;
+    private String anchorText;
+
+    public ChildNode(String childUrl, String anchorText) {
+      this.childUrl = childUrl;
+      this.anchorText = anchorText;
+    }
+
+    public String getAnchorText() {
+      return anchorText;
+    }
+
+    public void setAnchorText(String anchorText) {
+      this.anchorText = anchorText;
+    }
+
+    public String getChildUrl() {
+      return childUrl;
+    }
+
+    public void setChildUrl(String childUrl) {
+      this.childUrl = childUrl;
+    }
+  }
+
+  /** @return the children list itself (not a copy) */
+  public List<ChildNode> getChildren() {
+    return children;
+  }
+
+  /** @param children the list to use as children from now on */
+  public void setChildren(List<ChildNode> children) {
+    this.children = children;
+  }
+
+}
Modified:
nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java?rev=1679613&r1=1679612&r2=1679613&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/DbResource.java Fri
May 15 18:03:09 2015
@@ -17,18 +17,26 @@
package org.apache.nutch.service.resources;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Map;
import javax.ws.rs.Consumes;
+import javax.ws.rs.DefaultValue;
+import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDbReader;
+import org.apache.nutch.fetcher.FetchNode;
+import org.apache.nutch.fetcher.FetchNodeDb;
import org.apache.nutch.service.model.request.DbQuery;
+import org.apache.nutch.service.model.response.FetchNodeDbInfo;
@Path(value = "/db")
public class DbResource extends AbstractResource {
@@ -55,7 +63,31 @@ public class DbResource extends Abstract
return null;
}
-
+
+ @GET
+ @Path(value="/fetchdb")
+ public List<FetchNodeDbInfo> fetchDb(@DefaultValue("0")@QueryParam("to")int
to, @DefaultValue("0")@QueryParam("from")int from){
+ List<FetchNodeDbInfo> listOfFetchedNodes = new
ArrayList<FetchNodeDbInfo>();
+ Map<Integer, FetchNode> fetchNodedbMap =
FetchNodeDb.getInstance().getFetchNodeDb();
+
+ if(to ==0 || to>fetchNodedbMap.size()){
+ to = fetchNodedbMap.size();
+ }
+ for(int i=from;i<=to;i++){
+ if(!fetchNodedbMap.containsKey(i)){
+ continue;
+ }
+ FetchNode node = fetchNodedbMap.get(i);
+ FetchNodeDbInfo fdbInfo = new FetchNodeDbInfo();
+ fdbInfo.setUrl(node.getUrl().toString());
+ fdbInfo.setStatus(node.getStatus());
+ fdbInfo.setNumOfOutlinks(node.getOutlinks().length);
+ fdbInfo.setChildNodes(node.getOutlinks());
+ listOfFetchedNodes.add(fdbInfo);
+ }
+
+ return listOfFetchedNodes;
+ }
@SuppressWarnings("resource")
private Response crawlDbStats(Configuration conf, Map<String, String> args,
String crawlId){
CrawlDbReader dbr = new CrawlDbReader();