Author: mattmann
Date: Sat Sep 19 05:13:41 2015
New Revision: 1703941
URL: http://svn.apache.org/viewvc?rev=1703941&view=rev
Log:
Fix for NUTCH-2099: Refactoring the REST endpoints for integration with webui
contributed by Sujen Shah <[email protected]>. This closes #59.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/service/model/request/JobConfig.java
nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
nutch/trunk/src/java/org/apache/nutch/util/NutchTool.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Sep 19 05:13:41 2015
@@ -2,6 +2,9 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2099 Refactoring the REST endpoints for integration with
+ webui (Sujen Shah via mattmann)
+
* NUTCH-2098 Add null SeedUrl constructor (Aron Ahmadia via mattmann)
* NUTCH-2093 Indexing filters to use current signatures (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Sat Sep 19
05:13:41 2015
@@ -29,6 +29,7 @@ import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
@@ -236,10 +237,10 @@ public class CrawlDb extends NutchTool i
* Used for Nutch REST service
*/
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
+
boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
false);
boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
@@ -260,31 +261,69 @@ public class CrawlDb extends NutchTool i
if (args.containsKey("noAdditions")) {
additionsAllowed = false;
}
-
- String crawldb = crawlId+"/crawldb";
- String segment_dir = crawlId+"/segments";
- File segmentsDir = new File(segment_dir);
- File[] segmentsList = segmentsDir.listFiles();
- Arrays.sort(segmentsList, new Comparator<File>(){
- @Override
- public int compare(File f1, File f2) {
- if(f1.lastModified()>f2.lastModified())
- return -1;
- else
- return 0;
- }
- });
-
- dirs.add(new Path(segmentsList[0].getPath()));
-
+
+ Path crawlDb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+ if(crawldbPath instanceof Path) {
+ crawlDb = (Path) crawldbPath;
+ }
+ else {
+ crawlDb = new Path(crawldbPath.toString());
+ }
+ }
+ else {
+ crawlDb = new Path(crawlId+"/crawldb");
+ }
+
+ Path segmentsDir;
+ final FileSystem fs = FileSystem.get(getConf());
+ if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+ Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+ if(segDir instanceof Path) {
+ segmentsDir = (Path) segDir;
+ }
+ else {
+ segmentsDir = new Path(segDir.toString());
+ }
+ FileStatus[] paths = fs.listStatus(segmentsDir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
+ dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+ }
+
+ else if(args.containsKey(Nutch.ARG_SEGMENT)) {
+ Object segments = args.get(Nutch.ARG_SEGMENT);
+ ArrayList<String> segmentList = new ArrayList<String>();
+ if(segments instanceof ArrayList) {
+ segmentList = (ArrayList<String>)segments;
+ }
+ for(String segment: segmentList) {
+ dirs.add(new Path(segment));
+ }
+ }
+ else {
+ String segment_dir = crawlId+"/segments";
+ File dir = new File(segment_dir);
+ File[] segmentsList = dir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ dirs.add(new Path(segmentsList[0].getPath()));
+ }
try {
- update(new Path(crawldb), dirs.toArray(new Path[dirs.size()]), normalize,
+ update(crawlDb, dirs.toArray(new Path[dirs.size()]), normalize,
filter, additionsAllowed, force);
- results.put(RESULT, Integer.toString(0));
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
} catch (Exception e) {
LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
- results.put(RESULT, Integer.toString(-1));
+ results.put(Nutch.VAL_RESULT, Integer.toString(-1));
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Sat Sep
19 05:13:41 2015
@@ -44,6 +44,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
@@ -298,17 +299,19 @@ public class DeduplicationJob extends Nu
}
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
-// if(args.size()<1){
-// throw new IllegalArgumentException("Required argument <crawldb>");
-// }
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
String[] arg = new String[1];
- String crawldb = crawlId+"/crawldb";
+ String crawldb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ crawldb = (String)args.get(Nutch.ARG_CRAWLDB);
+ }
+ else {
+ crawldb = crawlId+"/crawldb";
+ }
arg[0] = crawldb;
int res = run(arg);
- results.put(RESULT, Integer.toString(res));
+ results.put(Nutch.VAL_RESULT, Integer.toString(res));
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sat Sep 19
05:13:41 2015
@@ -752,15 +752,10 @@ public class Generator extends NutchTool
}
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
-
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
- String crawldb = (args.containsKey("crawldb")) ? args.get("crawldb") :
crawlId+"/crawldb";
- Path dbDir = new Path(crawldb);
- String segments_dir = (args.containsKey("segment_dir")) ?
args.get("segments_dir") : crawlId+"/segments";
- Path segmentsDir = new Path(segments_dir);
+
long curTime = System.currentTimeMillis();
long topN = Long.MAX_VALUE;
int numFetchers = -1;
@@ -769,15 +764,42 @@ public class Generator extends NutchTool
boolean force = false;
int maxNumSegments = 1;
+ Path crawlDb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+ if(crawldbPath instanceof Path) {
+ crawlDb = (Path) crawldbPath;
+ }
+ else {
+ crawlDb = new Path(crawldbPath.toString());
+ }
+ }
+ else {
+ crawlDb = new Path(crawlId+"/crawldb");
+ }
+
+ Path segmentsDir;
+ if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+ Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+ if(segDir instanceof Path) {
+ segmentsDir = (Path) segDir;
+ }
+ else {
+ segmentsDir = new Path(segDir.toString());
+ }
+ }
+ else {
+ segmentsDir = new Path(crawlId+"/segments");
+ }
if (args.containsKey("topN")) {
- topN = Long.parseLong(args.get("topN"));
+ topN = Long.parseLong((String)args.get("topN"));
}
if (args.containsKey("numFetchers")) {
- numFetchers = Integer.parseInt(args.get("numFetchers"));
+ numFetchers = Integer.parseInt((String)args.get("numFetchers"));
}
if (args.containsKey("adddays")) {
- long numDays = Integer.parseInt(args.get("adddays"));
+ long numDays = Integer.parseInt((String)args.get("adddays"));
curTime += numDays * 1000L * 60 * 60 * 24;
}
if (args.containsKey("noFilter")) {
@@ -790,23 +812,23 @@ public class Generator extends NutchTool
force = true;
}
if (args.containsKey("maxNumSegments")) {
- maxNumSegments = Integer.parseInt(args.get("maxNumSegments"));
+ maxNumSegments = Integer.parseInt((String)args.get("maxNumSegments"));
}
try {
- Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+ Path[] segs = generate(crawlDb, segmentsDir, numFetchers, topN, curTime,
filter, norm, force, maxNumSegments);
if (segs == null){
- results.put(RESULT, Integer.toString(1));
+ results.put(Nutch.VAL_RESULT, Integer.toString(1));
return results;
}
} catch (Exception e) {
LOG.error("Generator: " + StringUtils.stringifyException(e));
- results.put(RESULT, Integer.toString(-1));
+ results.put(Nutch.VAL_RESULT, Integer.toString(-1));
return results;
}
- results.put(RESULT, Integer.toString(0));
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 19
05:13:41 2015
@@ -388,17 +388,37 @@ public class Injector extends NutchTool
/**
* Used by the Nutch REST service
*/
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
if(args.size()<1){
throw new IllegalArgumentException("Required arguments <url_dir>");
}
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
- String crawldb = crawlId+"/crawldb";
- String url_dir = args.get("url_dir");
- inject(new Path(crawldb), new Path(url_dir));
- results.put(RESULT, Integer.toString(0));
+ Path crawlDb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+ if(crawldbPath instanceof Path) {
+ crawlDb = (Path) crawldbPath;
+ }
+ else {
+ crawlDb = new Path(crawldbPath.toString());
+ }
+ }
+ else {
+ crawlDb = new Path(crawlId+"/crawldb");
+ }
+
+ Path input;
+ Object path = args.get(Nutch.ARG_SEEDDIR);
+ if(path instanceof Path) {
+ input = (Path) path;
+ }
+ else {
+ input = new Path(path.toString());
+ }
+
+ inject(crawlDb, input);
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sat Sep 19 05:13:41
2015
@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.*;
@@ -342,15 +343,25 @@ public class LinkDb extends NutchTool im
* Used for Nutch REST service
*/
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
-// if (args.size() < 2) {
-// throw new IllegalArgumentException("Required arguments <linkdb> (-dir
<segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
-// }
-
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
+
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
- String linkdb = crawlId + "/linkdb";
- Path db = new Path(linkdb);
+
+ Path linkdb;
+ if(args.containsKey(Nutch.ARG_LINKDB)) {
+ Object path = args.get(Nutch.ARG_LINKDB);
+ if(path instanceof Path) {
+ linkdb = (Path) path;
+ }
+ else {
+ linkdb = new Path(path.toString());
+ }
+ }
+ else {
+ linkdb = new Path(crawlId+"/linkdb");
+ }
+
+
ArrayList<Path> segs = new ArrayList<Path>();
boolean filter = true;
boolean normalize = true;
@@ -364,26 +375,53 @@ public class LinkDb extends NutchTool im
if (args.containsKey("force")) {
force = true;
}
- String segment_dir = crawlId+"/segments";
- File segmentsDir = new File(segment_dir);
- File[] segmentsList = segmentsDir.listFiles();
- Arrays.sort(segmentsList, new Comparator<File>(){
- @Override
- public int compare(File f1, File f2) {
- if(f1.lastModified()>f2.lastModified())
- return -1;
- else
- return 0;
- }
- });
- segs.add(new Path(segmentsList[0].getPath()));
+
+ Path segmentsDir;
+ final FileSystem fs = FileSystem.get(getConf());
+ if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+ Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+ if(segDir instanceof Path) {
+ segmentsDir = (Path) segDir;
+ }
+ else {
+ segmentsDir = new Path(segDir.toString());
+ }
+ FileStatus[] paths = fs.listStatus(segmentsDir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
+ segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+ }
+ else if(args.containsKey(Nutch.ARG_SEGMENT)) {
+ Object segments = args.get(Nutch.ARG_SEGMENT);
+ ArrayList<String> segmentList = new ArrayList<String>();
+ if(segments instanceof ArrayList) {
+ segmentList = (ArrayList<String>)segments;
+ }
+ for(String segment: segmentList) {
+ segs.add(new Path(segment));
+ }
+ }
+ else {
+ String segment_dir = crawlId+"/segments";
+ File dir = new File(segment_dir);
+ File[] segmentsList = dir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ segs.add(new Path(segmentsList[0].getPath()));
+ }
try {
- invert(db, segs.toArray(new Path[segs.size()]), normalize, filter,
force);
- results.put(RESULT, Integer.toString(0));
+ invert(linkdb, segs.toArray(new Path[segs.size()]), normalize, filter,
force);
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
} catch (Exception e) {
LOG.error("LinkDb: " + StringUtils.stringifyException(e));
- results.put(RESULT, Integer.toString(-1));
+ results.put(Nutch.VAL_RESULT, Integer.toString(-1));
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Sep 19
05:13:41 2015
@@ -556,41 +556,53 @@ public class Fetcher extends NutchTool i
}
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
- String segment_dir = crawlId+"/segments";
- File segmentsDir = new File(segment_dir);
- File[] segmentsList = segmentsDir.listFiles();
- Arrays.sort(segmentsList, new Comparator<File>(){
- @Override
- public int compare(File f1, File f2) {
- if(f1.lastModified()>f2.lastModified())
- return -1;
- else
- return 0;
- }
- });
-
- Path segment = new Path(segmentsList[0].getPath());
+
+ Path segment;
+ if(args.containsKey(Nutch.ARG_SEGMENT)) {
+ Object seg = args.get(Nutch.ARG_SEGMENT);
+ if(seg instanceof Path) {
+ segment = (Path) seg;
+ }
+ else {
+ segment = new Path(seg.toString());
+ }
+ }
+ else {
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ segment = new Path(segmentsList[0].getPath());
+ }
+
int threads = getConf().getInt("fetcher.threads.fetch", 10);
boolean parsing = false;
// parse command line
if (args.containsKey("threads")) { // found -threads option
- threads = Integer.parseInt(args.get("threads"));
+ threads = Integer.parseInt((String)args.get("threads"));
}
getConf().setInt("fetcher.threads.fetch", threads);
try {
fetch(segment, threads);
- results.put(RESULT, Integer.toString(0));
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
} catch (Exception e) {
LOG.error("Fetcher: " + StringUtils.stringifyException(e));
- results.put(RESULT, Integer.toString(-1));
+ results.put(Nutch.VAL_RESULT, Integer.toString(-1));
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Sat Sep 19
05:13:41 2015
@@ -28,6 +28,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Random;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
@@ -235,7 +236,7 @@ public class IndexingJob extends NutchTo
//Used for REST API
@Override
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
boolean noCommit = false;
boolean deleteGone = false;
boolean filter = false;
@@ -244,20 +245,50 @@ public class IndexingJob extends NutchTo
String params= null;
Configuration conf = getConf();
- String crawldb = crawlId+"/crawldb";
- Path crawlDb = new Path(crawldb);
- Path linkDb = null;
+ Path crawlDb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+ if(crawldbPath instanceof Path) {
+ crawlDb = (Path) crawldbPath;
+ }
+ else {
+ crawlDb = new Path(crawldbPath.toString());
+ }
+ }
+ else {
+ crawlDb = new Path(crawlId+"/crawldb");
+ }
+
+ Path linkdb = null;
List<Path> segments = new ArrayList<Path>();
- if(args.containsKey("linkdb")){
- linkDb = new Path(crawlId+"/linkdb");
+ if(args.containsKey(Nutch.ARG_LINKDB)){
+ if(args.containsKey(Nutch.ARG_LINKDB)) {
+ Object path = args.get(Nutch.ARG_LINKDB);
+ if(path instanceof Path) {
+ linkdb = (Path) path;
+ }
+ else {
+ linkdb = new Path(path.toString());
+ }
+ }
+ else {
+ linkdb = new Path(crawlId+"/linkdb");
+ }
}
- if(args.containsKey("dir")){
+ if(args.containsKey(Nutch.ARG_SEGMENTDIR)){
isSegment = true;
- Path dir = new Path(crawlId+"/segments");
- FileSystem fs = dir.getFileSystem(getConf());
- FileStatus[] fstats = fs.listStatus(dir,
+ Path segmentsDir;
+ Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+ if(segDir instanceof Path) {
+ segmentsDir = (Path) segDir;
+ }
+ else {
+ segmentsDir = new Path(segDir.toString());
+ }
+ FileSystem fs = segmentsDir.getFileSystem(getConf());
+ FileStatus[] fstats = fs.listStatus(segmentsDir,
HadoopFSUtil.getPassDirectoriesFilter(fs));
Path[] files = HadoopFSUtil.getPaths(fstats);
for (Path p : files) {
@@ -266,15 +297,19 @@ public class IndexingJob extends NutchTo
}
}
}
-
- if(args.containsKey("segments")){
+
+ if(args.containsKey(Nutch.ARG_SEGMENT)){
isSegment = true;
- String listOfSegments[] = args.get("segments").split(",");
- for(String s: listOfSegments){
- segments.add(new Path(s));
+ Object seg = args.get(Nutch.ARG_SEGMENT);
+ ArrayList<String> segmentList = new ArrayList<String>();
+ if(seg instanceof ArrayList) {
+ segmentList = (ArrayList<String>)seg;
+ }
+ for(String segment: segmentList) {
+ segments.add(new Path(segment));
}
}
-
+
if(!isSegment){
String segment_dir = crawlId+"/segments";
File segmentsDir = new File(segment_dir);
@@ -288,11 +323,10 @@ public class IndexingJob extends NutchTo
return 0;
}
});
-
Path segment = new Path(segmentsList[0].getPath());
segments.add(segment);
}
-
+
if(args.containsKey("noCommit")){
noCommit = true;
}
@@ -306,13 +340,13 @@ public class IndexingJob extends NutchTo
filter = true;
}
if(args.containsKey("params")){
- params = args.get("params");
+ params = (String)args.get("params");
}
setConf(conf);
- index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ index(crawlDb, linkdb, segments, noCommit, deleteGone, params, filter,
normalize);
Map<String, Object> results = new HashMap<String, Object>();
- results.put("result", 0);
+ results.put(Nutch.VAL_RESULT, 0);
return results;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Sat Sep 19
05:13:41 2015
@@ -80,4 +80,17 @@ public interface Nutch {
public static final String STAT_PROGRESS = "progress";
/**Used by Nutch REST service */
public static final String CRAWL_ID_KEY = "storage.crawl.id";
+ /** Argument key to specify location of the seed url dir for the REST
endpoints **/
+ public static final String ARG_SEEDDIR = "url_dir";
+ /** Argument key to specify the location of crawldb for the REST
endpoints **/
+ public static final String ARG_CRAWLDB = "crawldb";
+ /** Argument key to specify the location of linkdb for the REST
endpoints **/
+ public static final String ARG_LINKDB = "linkdb";
+ /** Name of the key used in the Result Map sent back by the REST
endpoint **/
+ public static final String VAL_RESULT = "result";
+ /** Argument key to specify the location of a directory of segments for
the REST endpoints.
+ * Similar to the -dir command in the bin/nutch script **/
+ public static final String ARG_SEGMENTDIR = "segment_dir";
+ /** Argument key to specify the location of individual segment for the
REST endpoints **/
+ public static final String ARG_SEGMENT = "segment";
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sat Sep 19
05:13:41 2015
@@ -268,33 +268,43 @@ public class ParseSegment extends NutchT
/*
* Used for Nutch REST service
*/
- public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ public Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception {
Map<String, Object> results = new HashMap<String, Object>();
- String RESULT = "result";
+ Path segment;
+ if(args.containsKey(Nutch.ARG_SEGMENT)) {
+ Object seg = args.get(Nutch.ARG_SEGMENT);
+ if(seg instanceof Path) {
+ segment = (Path) seg;
+ }
+ else {
+ segment = new Path(seg.toString());
+ }
+ }
+ else {
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ segment = new Path(segmentsList[0].getPath());
+ }
+
if (args.containsKey("nofilter")) {
getConf().setBoolean("parse.filter.urls", false);
}
if (args.containsKey("nonormalize")) {
getConf().setBoolean("parse.normalize.urls", false);
}
-
- String segment_dir = crawlId+"/segments";
- File segmentsDir = new File(segment_dir);
- File[] segmentsList = segmentsDir.listFiles();
- Arrays.sort(segmentsList, new Comparator<File>(){
- @Override
- public int compare(File f1, File f2) {
- if(f1.lastModified()>f2.lastModified())
- return -1;
- else
- return 0;
- }
- });
-
- Path segment = new Path(segmentsList[0].getPath());
parse(segment);
- results.put(RESULT, Integer.toString(0));
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
return results;
}
}
Modified:
nutch/trunk/src/java/org/apache/nutch/service/model/request/JobConfig.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/request/JobConfig.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/model/request/JobConfig.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/model/request/JobConfig.java
Sat Sep 19 05:13:41 2015
@@ -27,7 +27,7 @@ public class JobConfig {
private JobType type;
private String confId;
private String jobClassName;
- private Map<String, String> args;
+ private Map<String, Object> args;
public String getCrawlId() {
return crawlId;
@@ -53,11 +53,11 @@ public class JobConfig {
this.confId = confId;
}
- public Map<String, String> getArgs() {
+ public Map<String, Object> getArgs() {
return args;
}
- public void setArgs(Map<String, String> args) {
+ public void setArgs(Map<String, Object> args) {
this.args = args;
}
Modified:
nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
Sat Sep 19 05:13:41 2015
@@ -35,7 +35,7 @@ public class JobInfo {
private String id;
private JobType type;
private String confId;
- private Map<String, String> args;
+ private Map<String, Object> args;
private Map<String, Object> result;
private State state;
private String msg;
@@ -69,10 +69,10 @@ public class JobInfo {
public void setConfId(String confId) {
this.confId = confId;
}
- public Map<String, String> getArgs() {
+ public Map<String, Object> getArgs() {
return args;
}
- public void setArgs(Map<String, String> args) {
+ public void setArgs(Map<String, Object> args) {
this.args = args;
}
public Map<String, Object> getResult() {
Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchTool.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchTool.java?rev=1703941&r1=1703940&r2=1703941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NutchTool.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NutchTool.java Sat Sep 19
05:13:41 2015
@@ -39,7 +39,7 @@ public abstract class NutchTool extends
/**
* Runs the tool, using a map of arguments. May return results, or null.
*/
- public abstract Map<String, Object> run(Map<String, String> args, String
crawlId)
+ public abstract Map<String, Object> run(Map<String, Object> args, String
crawlId)
throws Exception;
public NutchTool(Configuration conf){