Author: joyce
Date: Thu Nov 12 18:53:05 2015
New Revision: 1714104
URL: http://svn.apache.org/viewvc?rev=1714104&view=rev
Log:
NUTCH-2165 - Fix FileDumper hard coded part-# folder
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1714104&r1=1714103&r2=1714104&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov 12 18:53:05 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-2165 FileDumper Util hard codes part-# folder name (joyce)
+
* NUTCH-2167 Backport TableUtil from 2.x for URL reversing (joyce)
* NUTCH-2160 Upgrade Selenium Java to 2.48.2 (lewismc, kwhitehall)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1714104&r1=1714103&r2=1714104&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Thu Nov 12
18:53:05 2015
@@ -162,107 +162,118 @@ public class FileDumper {
for (File segment : segmentDirs) {
LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
DataOutputStream doutputStream = null;
- try {
- String segmentPath = segment.getAbsolutePath() + "/" + Content.DIR_NAME
- + "/part-00000/data";
- Path file = new Path(segmentPath);
- if (!new File(file.toString()).exists()) {
- LOG.warn("Skipping segment: [" + segmentPath
- + "]: no data directory present");
- continue;
- }
- SequenceFile.Reader reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(file));
- Writable key = (Writable) reader.getKeyClass().newInstance();
- Content content = null;
+ File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
+ File[] partDirs = segmentDir.listFiles(new FileFilter() {
+ @Override
+ public boolean accept(File file) {
+ return file.canRead() && file.isDirectory();
+ }
+ });
- while (reader.next(key)) {
- content = new Content();
- reader.getCurrentValue(content);
- String url = key.toString();
- String baseName = FilenameUtils.getBaseName(url);
- String extension = FilenameUtils.getExtension(url);
- if (extension == null || (extension != null &&
extension.equals(""))) {
- extension = "html";
+ for (File partDir : partDirs) {
+ try {
+ String segmentPath = partDir + "/data";
+ Path file = new Path(segmentPath);
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" + segmentPath
+ + "]: no data directory present");
+ continue;
}
- String filename = baseName + "." + extension;
- ByteArrayInputStream bas = null;
- Boolean filter = false;
- try {
- bas = new ByteArrayInputStream(content.getContent());
- String mimeType = new Tika().detect(content.getContent());
- collectStats(typeCounts, mimeType);
- if (mimeType != null) {
- if (mimeTypes == null
- || Arrays.asList(mimeTypes).contains(mimeType)) {
- collectStats(filteredCounts, mimeType);
- filter = true;
- }
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(file));
+
+ Writable key = (Writable) reader.getKeyClass().newInstance();
+ Content content = null;
+
+ while (reader.next(key)) {
+ content = new Content();
+ reader.getCurrentValue(content);
+ String url = key.toString();
+ String baseName = FilenameUtils.getBaseName(url);
+ String extension = FilenameUtils.getExtension(url);
+ if (extension == null || (extension != null &&
extension.equals(""))) {
+ extension = "html";
}
- } catch (Exception e) {
- e.printStackTrace();
- LOG.warn("Tika is unable to detect type for: [" + url + "]");
- } finally {
- if (bas != null) {
- try {
- bas.close();
- } catch (Exception ignore) {
+
+ String filename = baseName + "." + extension;
+ ByteArrayInputStream bas = null;
+ Boolean filter = false;
+ try {
+ bas = new ByteArrayInputStream(content.getContent());
+ String mimeType = new Tika().detect(content.getContent());
+ collectStats(typeCounts, mimeType);
+ if (mimeType != null) {
+ if (mimeTypes == null
+ || Arrays.asList(mimeTypes).contains(mimeType)) {
+ collectStats(filteredCounts, mimeType);
+ filter = true;
+ }
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.warn("Tika is unable to detect type for: [" + url + "]");
+ } finally {
+ if (bas != null) {
+ try {
+ bas.close();
+ } catch (Exception ignore) {
+ }
}
}
- }
- if (filter) {
- if (!mimeTypeStats) {
- String md5Ofurl = DumpFileUtil.getUrlMD5(url);
-
- String fullDir = outputDir.getAbsolutePath();
- if (!flatDir) {
- fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir,
md5Ofurl);
- }
+ if (filter) {
+ if (!mimeTypeStats) {
+ String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+
+ String fullDir = outputDir.getAbsolutePath();
+ if (!flatDir) {
+ fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir,
md5Ofurl);
+ }
- if (!Strings.isNullOrEmpty(fullDir)) {
- String outputFullPath = String.format("%s/%s", fullDir,
DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
- File outputFile = new File(outputFullPath);
-
- if (!outputFile.exists()) {
- LOG.info("Writing: [" + outputFullPath + "]");
-
- // Modified to prevent FileNotFoundException (Invalid
Argument)
- FileOutputStream output = null;
- try {
- output = new FileOutputStream(outputFile);
- IOUtils.write(content.getContent(), output);
- }
- catch (Exception e) {
- LOG.warn("Write Error: [" + outputFullPath + "]");
- e.printStackTrace();
- }
- finally {
- if (output != null) {
- output.flush();
- try {
- output.close();
- } catch (Exception ignore) {
+ if (!Strings.isNullOrEmpty(fullDir)) {
+ String outputFullPath = String.format("%s/%s", fullDir,
DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+ File outputFile = new File(outputFullPath);
+
+ if (!outputFile.exists()) {
+ LOG.info("Writing: [" + outputFullPath + "]");
+
+ // Modified to prevent FileNotFoundException (Invalid
Argument)
+ FileOutputStream output = null;
+ try {
+ output = new FileOutputStream(outputFile);
+ IOUtils.write(content.getContent(), output);
+ }
+ catch (Exception e) {
+ LOG.warn("Write Error: [" + outputFullPath + "]");
+ e.printStackTrace();
+ }
+ finally {
+ if (output != null) {
+ output.flush();
+ try {
+ output.close();
+ } catch (Exception ignore) {
+ }
}
}
+ fileCount++;
+ } else {
+ LOG.info("Skipping writing: [" + outputFullPath
+ + "]: file already exists");
}
- fileCount++;
- } else {
- LOG.info("Skipping writing: [" + outputFullPath
- + "]: file already exists");
}
}
}
}
- }
- reader.close();
- } finally {
- fs.close();
- if (doutputStream != null) {
- try {
- doutputStream.close();
- } catch (Exception ignore) {
+ reader.close();
+ } finally {
+ fs.close();
+ if (doutputStream != null) {
+ try {
+ doutputStream.close();
+ } catch (Exception ignore) {
+ }
}
}
}