Repository: nutch Updated Branches: refs/heads/master 9f32fe84a -> d27c351f4
Fix for Nutch-2246: Refactor /seed end point, this closes #137 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d27c351f Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d27c351f Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d27c351f Branch: refs/heads/master Commit: d27c351f440f5a5932049232760d492585078a54 Parents: 9f32fe8 Author: Sujen Shah <sujen1...@gmail.com> Authored: Mon Aug 1 11:46:39 2016 -0400 Committer: Sujen Shah <su...@apache.org> Committed: Tue Aug 9 14:47:07 2016 -0700 ---------------------------------------------------------------------- CHANGES.txt | 1 + src/java/org/apache/nutch/crawl/Injector.java | 33 +++++++---- src/java/org/apache/nutch/metadata/Nutch.java | 2 + .../org/apache/nutch/service/NutchServer.java | 7 +++ .../org/apache/nutch/service/SeedManager.java | 33 +++++++++++ .../nutch/service/impl/SeedManagerImpl.java | 58 ++++++++++++++++++++ .../nutch/service/model/request/SeedList.java | 10 ++++ .../nutch/service/resources/SeedResource.java | 27 ++++++++- 8 files changed, 157 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index ffcf5ae..2e0e041 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -69,6 +69,7 @@ New Feature Task + [NUTCH-2246] - Refactor /seed endpoint for backward compatibility [NUTCH-2201] - Remove loops program from webgraph package [NUTCH-2211] - Filter and normalizer checkers missing in bin/nutch [NUTCH-2220] - Rename db.* options used only by the linkdb to linkdb.* http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/crawl/Injector.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 383aaf1..6575782 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -41,6 +41,7 @@ import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.LockUtil; +import org.apache.nutch.service.NutchServer; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.TimingUtil; @@ -477,11 +478,28 @@ public class Injector extends NutchTool implements Tool { */ public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception { - if (args.size() < 1) { - throw new IllegalArgumentException("Required arguments <url_dir>"); + if(args.size()<1){ + throw new IllegalArgumentException("Required arguments <url_dir> or <seedName>"); + } + Path input; + Object path = null; + if(args.containsKey(Nutch.ARG_SEEDDIR)) { + path = args.get(Nutch.ARG_SEEDDIR); + } + else if(args.containsKey(Nutch.ARG_SEEDNAME)) { + path = NutchServer.getInstance().getSeedManager(). + getSeedList((String)args.get(Nutch.ARG_SEEDNAME)).getSeedFilePath(); + } + else { + throw new IllegalArgumentException("Required arguments <url_dir> or <seedName>"); + } + if(path instanceof Path) { + input = (Path) path; + } + else { + input = new Path(path.toString()); } Map<String, Object> results = new HashMap<String, Object>(); - Path crawlDb; if (args.containsKey(Nutch.ARG_CRAWLDB)) { Object crawldbPath = args.get(Nutch.ARG_CRAWLDB); @@ -493,15 +511,6 @@ public class Injector extends NutchTool implements Tool { } else { crawlDb = new Path(crawlId + "/crawldb"); } - - Path input; - Object path = args.get(Nutch.ARG_SEEDDIR); - if (path instanceof Path) { - input = (Path) path; - } else { - input = new Path(path.toString()); - } - inject(crawlDb, input); results.put(Nutch.VAL_RESULT, Integer.toString(0)); return results; http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/metadata/Nutch.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java index de80399..cbc3317 100644 --- a/src/java/org/apache/nutch/metadata/Nutch.java +++ b/src/java/org/apache/nutch/metadata/Nutch.java @@ -84,6 +84,8 @@ public interface Nutch { public static final String CRAWL_ID_KEY = "storage.crawl.id"; /** Argument key to specify location of the seed url dir for the REST endpoints **/ public static final String ARG_SEEDDIR = "url_dir"; + /** Argument key to specify name of a seed list for the REST endpoints **/ + public static final String ARG_SEEDNAME = "seedName"; /** Argument key to specify the location of crawldb for the REST endpoints **/ public static final String ARG_CRAWLDB = "crawldb"; /** Argument key to specify the location of linkdb for the REST endpoints **/ http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/service/NutchServer.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/NutchServer.java b/src/java/org/apache/nutch/service/NutchServer.java index e206707..6d531e0 100644 --- a/src/java/org/apache/nutch/service/NutchServer.java +++ b/src/java/org/apache/nutch/service/NutchServer.java @@ -41,6 +41,7 @@ import org.apache.nutch.fetcher.FetchNodeDb; import org.apache.nutch.service.impl.ConfManagerImpl; import org.apache.nutch.service.impl.JobFactory; import org.apache.nutch.service.impl.JobManagerImpl; +import org.apache.nutch.service.impl.SeedManagerImpl; import org.apache.nutch.service.impl.NutchServerPoolExecutor; import org.apache.nutch.service.model.response.JobInfo; import org.apache.nutch.service.model.response.JobInfo.State; @@ -74,6 +75,7 @@ public class NutchServer { private boolean running; private ConfManager configManager; private JobManager jobManager; + private SeedManager seedManager; private JAXRSServerFactoryBean sf; private static FetchNodeDb fetchNodeDb; @@ -86,6 +88,7 @@ public class NutchServer { private NutchServer() { configManager = new ConfManagerImpl(); + seedManager = new SeedManagerImpl(); BlockingQueue<Runnable> runnables = Queues.newArrayBlockingQueue(JOB_CAPACITY); NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables); jobManager = new JobManagerImpl(new JobFactory(), configManager, executor); @@ -149,6 +152,10 @@ public class NutchServer { public JobManager getJobManager() { return jobManager; } + + public SeedManager getSeedManager() { + return seedManager; + } public FetchNodeDb getFetchNodeDb(){ return fetchNodeDb; http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/service/SeedManager.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/SeedManager.java b/src/java/org/apache/nutch/service/SeedManager.java new file mode 100644 index 0000000..a96c4ac --- /dev/null +++ b/src/java/org/apache/nutch/service/SeedManager.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.service; + +import java.util.Map; + +import org.apache.nutch.service.model.request.SeedList; + +public interface SeedManager { + + public SeedList getSeedList(String seedName); + + public void setSeedList(String seedName, SeedList seedList); + + public boolean deleteSeedList(String seedName); + + public Map<String, SeedList> getSeeds(); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java new file mode 100644 index 0000000..c7b7607 --- /dev/null +++ b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.service.impl; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.nutch.service.SeedManager; +import org.apache.nutch.service.model.request.SeedList; + +public class SeedManagerImpl implements SeedManager { + + private static Map<String, SeedList> seeds; + + public SeedManagerImpl() { + seeds = new HashMap<>(); + } + + public SeedList getSeedList(String seedName) { + if(seeds.containsKey(seedName)) { + return seeds.get(seedName); + } + else + return null; + } + + public void setSeedList(String seedName, SeedList seedList) { + seeds.put(seedName, seedList); + } + + public Map<String, SeedList> getSeeds(){ + return seeds; + } + + public boolean deleteSeedList(String seedName) { + if(seeds.containsKey(seedName)) { + seeds.remove(seedName); + return true; + } + else + return false; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/service/model/request/SeedList.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/model/request/SeedList.java b/src/java/org/apache/nutch/service/model/request/SeedList.java index bbb3e2a..5ba60da 100644 --- a/src/java/org/apache/nutch/service/model/request/SeedList.java +++ b/src/java/org/apache/nutch/service/model/request/SeedList.java @@ -29,6 +29,8 @@ public class SeedList implements Serializable { private Long id; private String name; + private String seedFilePath; + @JsonManagedReference private Collection<SeedUrl> seedUrls; @@ -57,6 +59,14 @@ public class SeedList implements Serializable { this.name = name; } + public String getSeedFilePath() { + return seedFilePath; + } + + public void setSeedFilePath(String seedFilePath) { + this.seedFilePath = seedFilePath; + } + @JsonIgnore public int getSeedUrlsCount() { if (CollectionUtils.isEmpty(seedUrls)) { http://git-wip-us.apache.org/repos/asf/nutch/blob/d27c351f/src/java/org/apache/nutch/service/resources/SeedResource.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java index 5261139..638af33 100644 --- a/src/java/org/apache/nutch/service/resources/SeedResource.java +++ b/src/java/org/apache/nutch/service/resources/SeedResource.java @@ -24,8 +24,10 @@ import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.util.Collection; +import java.util.Map; import javax.ws.rs.Consumes; +import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.Path; import javax.ws.rs.Produces; @@ -35,6 +37,7 @@ import javax.ws.rs.core.Response; import javax.ws.rs.core.Response.Status; import org.apache.commons.collections.CollectionUtils; +import org.apache.nutch.service.NutchServer; import org.apache.nutch.service.model.request.SeedList; import org.apache.nutch.service.model.request.SeedUrl; import org.slf4j.Logger; @@ -48,6 +51,23 @@ public class SeedResource extends AbstractResource { .getLogger(AdminResource.class); /** + * Gets the list of seedFiles already created + * @return + */ + @GET + @Path("/") + @Produces(MediaType.APPLICATION_JSON) + public Response getSeedLists() { + Map<String, SeedList> seeds = NutchServer.getInstance().getSeedManager().getSeeds(); + if(seeds!=null) { + return Response.ok(seeds).build(); + } + else { + return Response.ok().build(); + } + } + + /** * Method creates seed list file and returns temporary directory path * @param seedList * @return @@ -70,8 +90,11 @@ public class SeedResource extends AbstractResource { writeUrl(writer, seedUrl); } } - - return Response.ok().entity(seedFile.getParent()).build(); + String seedFilePath = seedFile.getParent(); + seedList.setSeedFilePath(seedFilePath); + NutchServer.getInstance().getSeedManager(). + setSeedList(seedList.getName(), seedList); + return Response.ok().entity(seedFilePath).build(); } private void writeUrl(BufferedWriter writer, SeedUrl seedUrl) {