JobFactory.java

mattmann Mon, 08 Jun 2015 23:02:17 -0700

Author: mattmann
Date: Tue Jun  9 06:01:46 2015
New Revision: 1684316

URL: http://svn.apache.org/r1684316
Log:
Fix for NUTCH-2037: Job endpoint to support Indexing from the REST API 
contributed by Sujen Shah <[email protected]> this closes #29


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
    nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun  9 06:01:46 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2037 Job endpoint to support Indexing from the REST API (Sujen Shah 
via mattmann)
+
 * NUTCH-2017 Remove debug log from MimeUtil (snagel)
 
 * NUTCH-2027 seed list REST endpoint for Nutch 1.10 (Asitang Mishra via 
mattmann)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Tue Jun  9 
06:01:46 2015
@@ -16,16 +16,20 @@
  */
 package org.apache.nutch.indexer;
 
+import java.io.File;
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Random;
 
 import org.apache.nutch.segment.SegmentChecker;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -40,6 +44,7 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -48,7 +53,7 @@ import org.slf4j.LoggerFactory;
  * Generic indexer which relies on the plugins implementing IndexWriter
  **/
 
-public class IndexingJob extends Configured implements Tool {
+public class IndexingJob extends NutchTool implements Tool {
 
   public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
 
@@ -197,4 +202,88 @@ public class IndexingJob extends Configu
         new IndexingJob(), args);
     System.exit(res);
   }
+
+
+  //Used for REST API
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) 
throws Exception {
+    boolean noCommit = false;
+    boolean deleteGone = false; 
+    boolean filter = false;
+    boolean normalize = false;
+    boolean isSegment = false;
+    String params= null;
+    Configuration conf = getConf();
+
+    String crawldb = crawlId+"/crawldb";
+    Path crawlDb = new Path(crawldb);
+    Path linkDb = null;
+    List<Path> segments = new ArrayList<Path>();
+
+    if(args.containsKey("linkdb")){
+      linkDb = new Path(crawlId+"/linkdb");
+    }
+
+    if(args.containsKey("dir")){
+      isSegment = true;
+      Path dir = new Path(crawlId+"/segments");
+      FileSystem fs = dir.getFileSystem(getConf());
+      FileStatus[] fstats = fs.listStatus(dir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+      Path[] files = HadoopFSUtil.getPaths(fstats);
+      for (Path p : files) {
+        if (SegmentChecker.isIndexable(p,fs)) {
+          segments.add(p);
+        }
+      }     
+    }
+  
+    if(args.containsKey("segments")){
+      isSegment = true;
+      String listOfSegments[] = args.get("segments").split(",");
+      for(String s: listOfSegments){
+        segments.add(new Path(s));
+      }
+    }
+    
+    if(!isSegment){
+      String segment_dir = crawlId+"/segments";
+      File segmentsDir = new File(segment_dir);
+      File[] segmentsList = segmentsDir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+
+      Path segment = new Path(segmentsList[0].getPath());
+      segments.add(segment);
+    }
+    
+    if(args.containsKey("noCommit")){
+      noCommit = true;
+    }
+    if(args.containsKey("deleteGone")){
+      deleteGone = true;
+    }
+    if(args.containsKey("normalize")){
+      normalize = true;
+    }
+    if(args.containsKey("filter")){
+      filter = true;
+    }
+    if(args.containsKey("params")){
+      params = args.get("params");
+    }
+    setConf(conf);
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+        normalize);
+    Map<String, Object> results = new HashMap<String, Object>();
+    results.put("result", 0);
+    return results;
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java?rev=1684316&r1=1684315&r2=1684316&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/service/impl/JobFactory.java Tue Jun  
9 06:01:46 2015
@@ -28,6 +28,7 @@ import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.Injector;
 import org.apache.nutch.crawl.LinkDb;
 import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.indexer.IndexingJob;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.util.NutchTool;
 
@@ -42,6 +43,7 @@ public class JobFactory {
     typeToClass.put(JobType.GENERATE, Generator.class);
     typeToClass.put(JobType.FETCH, Fetcher.class);
     typeToClass.put(JobType.PARSE, ParseSegment.class);
+    typeToClass.put(JobType.INDEX, IndexingJob.class);
     typeToClass.put(JobType.UPDATEDB, CrawlDb.class);
     typeToClass.put(JobType.INVERTLINKS, LinkDb.class);
     typeToClass.put(JobType.DEDUP, DeduplicationJob.class);

svn commit: r1684316 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingJob.java src/java/org/apache/nutch/service/impl/JobFactory.java

Reply via email to