This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 34236ffecf478a1776559b0ed8c1ad929483d752
Author: Madhav Sharan <[email protected]>
AuthorDate: Wed Mar 29 18:07:07 2017 -0400

    fix for NUTCH-2370 contributed by [email protected]
---
 src/java/org/apache/nutch/tools/FileDumper.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/nutch/tools/FileDumper.java b/src/java/org/apache/nutch/tools/FileDumper.java
index 53e6be4..51cc124 100644
--- a/src/java/org/apache/nutch/tools/FileDumper.java
+++ b/src/java/org/apache/nutch/tools/FileDumper.java
@@ -57,6 +57,7 @@ import org.apache.tika.Tika;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.codehaus.jackson.map.ObjectMapper;
 /**
  * The file dumper tool enables one to reverse generate the raw content from
  * Nutch segment data directories.
@@ -154,6 +155,7 @@ public class FileDumper {
     for (File segment : segmentDirs) {
       LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
       DataOutputStream doutputStream = null;
+      Map<String, String> filenameToUrl = new HashMap<String, String>();
 
       File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
       File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
@@ -242,7 +244,7 @@ public class FileDumper {
                 } else {
                   outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                 }
-
+                filenameToUrl.put(outputFullPath, url);
                 File outputFile = new File(outputFullPath);
 
                 if (!outputFile.exists()) {
@@ -284,6 +286,10 @@ public class FileDumper {
           }
         }
       }
+      //save filenameToUrl in a json file for each segment there is one mapping file
+      String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName() );
+      new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
+
     }
     LOG.info("Dumper File Stats: "
         + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.
