Repository: nutch
Updated Branches:
  refs/heads/master 9092e233f -> 24cc2aa9c
Fix for NUTCH-2327: Seeds injected in REST must be ingested into HDFS, this closes #155 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/24cc2aa9 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/24cc2aa9 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/24cc2aa9 Branch: refs/heads/master Commit: 24cc2aa9c68fa356e4e926b6bf86bac99d52e38c Parents: 9092e23 Author: Sujen Shah <[email protected]> Authored: Tue Oct 18 21:36:27 2016 -0700 Committer: Sujen Shah <[email protected]> Committed: Tue Oct 25 09:02:00 2016 -0700 ---------------------------------------------------------------------- .../nutch/service/resources/SeedResource.java | 75 +++++++------------- 1 file changed, 27 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/24cc2aa9/src/java/org/apache/nutch/service/resources/SeedResource.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java index 638af33..61a0526 100644 --- a/src/java/org/apache/nutch/service/resources/SeedResource.java +++ b/src/java/org/apache/nutch/service/resources/SeedResource.java @@ -16,13 +16,7 @@ */ package org.apache.nutch.service.resources; -import static javax.ws.rs.core.Response.status; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; +import java.io.OutputStream; import java.util.Collection; import java.util.Map; @@ -31,19 +25,19 @@ import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.Path; import javax.ws.rs.Produces; -import javax.ws.rs.WebApplicationException; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; import javax.ws.rs.core.Response.Status; import 
org.apache.commons.collections.CollectionUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.nutch.service.NutchServer; import org.apache.nutch.service.model.request.SeedList; import org.apache.nutch.service.model.request.SeedUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.io.Files; @Path("/seed") public class SeedResource extends AbstractResource { @@ -77,58 +71,43 @@ public class SeedResource extends AbstractResource { @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.TEXT_PLAIN) public Response createSeedFile(SeedList seedList) { + try { if (seedList == null) { return Response.status(Status.BAD_REQUEST) .entity("Seed list cannot be empty!").build(); } - File seedFile = createSeedFile(); - BufferedWriter writer = getWriter(seedFile); - Collection<SeedUrl> seedUrls = seedList.getSeedUrls(); - if (CollectionUtils.isNotEmpty(seedUrls)) { - for (SeedUrl seedUrl : seedUrls) { - writeUrl(writer, seedUrl); - } - } - String seedFilePath = seedFile.getParent(); + + String seedFilePath = writeToSeedFile(seedUrls); seedList.setSeedFilePath(seedFilePath); NutchServer.getInstance().getSeedManager(). 
setSeedList(seedList.getName(), seedList); return Response.ok().entity(seedFilePath).build(); - } - - private void writeUrl(BufferedWriter writer, SeedUrl seedUrl) { - try { - writer.write(seedUrl.getUrl()); - writer.newLine(); - writer.flush(); - } catch (IOException e) { - throw handleException(e); + } catch (Exception e) { + log.warn("Error while creating seed : {}", e.getMessage()); } + return Response.serverError().build(); } - private BufferedWriter getWriter(File seedFile) { - try { - return new BufferedWriter(new FileWriter(seedFile)); - } catch (FileNotFoundException e) { - throw handleException(e); - } catch (IOException e) { - throw handleException(e); + private String writeToSeedFile(Collection<SeedUrl> seedUrls) throws Exception { + String seedFilePath = "seedFiles/seed-" + System.currentTimeMillis(); + org.apache.hadoop.fs.Path seedFolder = new org.apache.hadoop.fs.Path(seedFilePath); + FileSystem fs = FileSystem.get(new Configuration()); + if(!fs.exists(seedFolder)) { + if(!fs.mkdirs(seedFolder)) { + throw new Exception("Could not create seed folder at : " + seedFolder); + } } - } - - private File createSeedFile() { - try { - return File.createTempFile("seed", ".txt", Files.createTempDir()); - } catch (IOException e) { - throw handleException(e); + String filename = seedFilePath + System.getProperty("file.separator") + "urls"; + org.apache.hadoop.fs.Path seedPath = new org.apache.hadoop.fs.Path(filename); + OutputStream os = fs.create(seedPath); + if (CollectionUtils.isNotEmpty(seedUrls)) { + for (SeedUrl seedUrl : seedUrls) { + os.write(seedUrl.getUrl().getBytes()); + os.write("\n".getBytes()); + } } + os.close(); + return seedPath.getParent().toString(); } - - private RuntimeException handleException(Exception e) { - log.error("Cannot create seed file!", e); - return new WebApplicationException(status(Status.INTERNAL_SERVER_ERROR) - .entity("Cannot create seed file!").build()); - } - }
