Author: mattmann
Date: Tue Jun 9 06:01:46 2015
New Revision: 1684316
URL: http://svn.apache.org/r1684316
Log:
Fix for NUTCH-2037: Job endpoint to support Indexing from the REST API
contributed by Sujen Shah <[email protected]> this closes #29
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 9 06:01:46 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2037 Job endpoint to support Indexing from the REST API (Sujen Shah
via mattmann)
+
* NUTCH-2017 Remove debug log from MimeUtil (snagel)
* NUTCH-2027 seed list REST endpoint for Nutch 1.10 (Asitang Mishra via
mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Tue Jun 9
06:01:46 2015
@@ -16,16 +16,20 @@
*/
package org.apache.nutch.indexer;
+import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
import java.util.List;
import java.util.Locale;
+import java.util.Map;
import java.util.Random;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -40,6 +44,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -48,7 +53,7 @@ import org.slf4j.LoggerFactory;
* Generic indexer which relies on the plugins implementing IndexWriter
**/
-public class IndexingJob extends Configured implements Tool {
+public class IndexingJob extends NutchTool implements Tool {
public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
@@ -197,4 +202,88 @@ public class IndexingJob extends Configu
new IndexingJob(), args);
System.exit(res);
}
+
+
+ //Used for REST API
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId)
throws Exception {
+ boolean noCommit = false;
+ boolean deleteGone = false;
+ boolean filter = false;
+ boolean normalize = false;
+ boolean isSegment = false;
+ String params= null;
+ Configuration conf = getConf();
+
+ String crawldb = crawlId+"/crawldb";
+ Path crawlDb = new Path(crawldb);
+ Path linkDb = null;
+ List<Path> segments = new ArrayList<Path>();
+
+ if(args.containsKey("linkdb")){
+ linkDb = new Path(crawlId+"/linkdb");
+ }
+
+ if(args.containsKey("dir")){
+ isSegment = true;
+ Path dir = new Path(crawlId+"/segments");
+ FileSystem fs = dir.getFileSystem(getConf());
+ FileStatus[] fstats = fs.listStatus(dir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
+ Path[] files = HadoopFSUtil.getPaths(fstats);
+ for (Path p : files) {
+ if (SegmentChecker.isIndexable(p,fs)) {
+ segments.add(p);
+ }
+ }
+ }
+
+ if(args.containsKey("segments")){
+ isSegment = true;
+ String listOfSegments[] = args.get("segments").split(",");
+ for(String s: listOfSegments){
+ segments.add(new Path(s));
+ }
+ }
+
+ if(!isSegment){
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+
+ Path segment = new Path(segmentsList[0].getPath());
+ segments.add(segment);
+ }
+
+ if(args.containsKey("noCommit")){
+ noCommit = true;
+ }
+ if(args.containsKey("deleteGone")){
+ deleteGone = true;
+ }
+ if(args.containsKey("normalize")){
+ normalize = true;
+ }
+ if(args.containsKey("filter")){
+ filter = true;
+ }
+ if(args.containsKey("params")){
+ params = args.get("params");
+ }
+ setConf(conf);
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize);
+ Map<String, Object> results = new HashMap<String, Object>();
+ results.put("result", 0);
+ return results;
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java Tue Jun
9 06:01:46 2015
@@ -28,6 +28,7 @@ import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.NutchTool;
@@ -42,6 +43,7 @@ public class JobFactory {
typeToClass.put(JobType.GENERATE, Generator.class);
typeToClass.put(JobType.FETCH, Fetcher.class);
typeToClass.put(JobType.PARSE, ParseSegment.class);
+ typeToClass.put(JobType.INDEX, IndexingJob.class);
typeToClass.put(JobType.UPDATEDB, CrawlDb.class);
typeToClass.put(JobType.INVERTLINKS, LinkDb.class);
typeToClass.put(JobType.DEDUP, DeduplicationJob.class);