Repository: nutch
Updated Branches:
  refs/heads/master 71f0471b8 -> dce7a28c7
Fix compilation errors for CommonCrawlDataDumper

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/dce7a28c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/dce7a28c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/dce7a28c

Branch: refs/heads/master
Commit: dce7a28c70297208c0d86a572f4c232120e72018
Parents: 71f0471
Author: Lewis John McGibbney <[email protected]>
Authored: Mon May 9 13:55:53 2016 -0700
Committer: Lewis John McGibbney <[email protected]>
Committed: Mon May 9 13:55:53 2016 -0700

----------------------------------------------------------------------
 .../nutch/tools/CommonCrawlDataDumper.java | 13 +-
 .../nutch/tools/TestCommonCrawlDataDumper.java | 150 +++++++++----------
 2 files changed, 87 insertions(+), 76 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/dce7a28c/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 5abd393..b4fc0a7 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -203,7 +203,6 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
    * the gzip option may be provided.
* @throws Exception */ - @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Configuration conf = NutchConfiguration.create(); int res = ToolRunner.run(conf, new CommonCrawlDataDumper(), args); @@ -573,6 +572,7 @@ public class CommonCrawlDataDumper extends Configured implements Tool { public int run(String[] args) throws Exception { Option helpOpt = new Option("h", "help", false, "show this help message."); // argument options + @SuppressWarnings("static-access") Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg() .withDescription( "output directory (which will be created) to host the CBOR data.") @@ -580,41 +580,52 @@ public class CommonCrawlDataDumper extends Configured implements Tool { // WARC format Option warcOpt = new Option("warc", "export to a WARC file"); + @SuppressWarnings("static-access") Option segOpt = OptionBuilder.withArgName("segment").hasArgs() .withDescription("the segment or directory containing segments to use").create("segment"); // create mimetype and gzip options + @SuppressWarnings("static-access") Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype") .hasArgs().withDescription( "an optional list of mimetypes to dump, excluding all others. 
Defaults to all.") .create("mimetype"); + @SuppressWarnings("static-access") Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false) .withDescription( "an optional flag indicating whether to additionally gzip the data.") .create("gzip"); + @SuppressWarnings("static-access") Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true) .withDescription("an optional prefix for key in the output format.") .create("keyPrefix"); + @SuppressWarnings("static-access") Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat") .hasArg(false).withDescription( "an optional format for timestamp in GMT epoch milliseconds.") .create("SimpleDateFormat"); + @SuppressWarnings("static-access") Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename") .hasArg(false) .withDescription("an optional format for output filename.") .create("epochFilename"); + @SuppressWarnings("static-access") Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false) .withDescription("an optional format for JSON output.") .create("jsonArray"); + @SuppressWarnings("static-access") Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false) .withDescription("an optional format for key value in JSON output.") .create("reverseKey"); + @SuppressWarnings("static-access") Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true) .withDescription("an optional file extension for output documents.") .create("extension"); + @SuppressWarnings("static-access") Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true) .withType(Number.class) .withDescription("an optional file size in bytes for the WARC file(s)") .create("warcSize"); + @SuppressWarnings("static-access") Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true) .withDescription("an optional linkdb parameter to include inlinks in dump files") .isRequired(false) 
http://git-wip-us.apache.org/repos/asf/nutch/blob/dce7a28c/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java index f635d67..1429925 100644 --- a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java +++ b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java @@ -41,85 +41,85 @@ import org.apache.nutch.tools.CommonCrawlConfig; */ public class TestCommonCrawlDataDumper { - @Test - public void testDump() throws Exception { - File sampleSegmentDir = new File(System.getProperty("test.build.data", - "."), "test-segments"); - File tempDir = Files.createTempDirectory("temp").toFile(); + @Test + public void testDump() throws Exception { + File sampleSegmentDir = new File(System.getProperty("test.build.data", + "."), "test-segments"); + File tempDir = Files.createTempDirectory("temp").toFile(); - String[] crawledFiles = { - "c463a4381eb837f9f5d45978cfbde79e_.html", - "a974b8d74f7779ab6c6f90b9b279467e_.html", - "6bc6497314656a3129732efd708e9f96_.html", - "6e88c40abe26cad0a726102997aed048_.html", - "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html", - "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html", - "8540187d75b9cd405b8fa97d665f9f90_.html", - "e501bc976c8693b4d28a55b79c390a32_.html", - "6add662f9f5758b7d75eec5cfa1f340b_.html", - "d4f20df3c37033dc516067ee1f424e4e_.html", - "d7b8fa9a02cdc95546030d04be4a98f3_solr.html", - "3cbe876e3a8e7a397811de3bb6a945cd_.html", - "5b987dde0da79d7f2e3f22b46437f514_bot.html", - "3d742820d9a701a1f02e10d5bf5ae633_credits.html", - "693673f3c73d04a26276effdea69b7ee_downloads.html", - "4f7e3469dafabb4c3b87b00531f81aa4_index.html", - "15c5330675be8a69995aab18ff9859e0_javadoc.html", - "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html", - 
"a7d66b68754c3665c66e62225255e3fd_version_control.html", - "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel", - "54ab3db10fe7b26415a04e21045125a8_1zE.html", - "1012a41c08092c40340598bd8ee0bfa6_PGa.html", - "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3", - "687d915dc264a77f35c61ba841936730_oHY.html", - "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html", - "550cab79e14110bbee61c36c61c830b0_1pbE15n.html", - "664ff07b46520cc1414494ae49da91f6_.html", - "04223714e648a6a43d7c8af8b095f733_.html", - "3c8ccb865cd72cca06635d74c7f2f3c4_.html", - "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html", - "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html", - "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html", - "78d04611985e7375b441e478fa36f610_.html", - "64adaebadd44e487a8b58894e979dc70_CHANGES.txt", - "a48e9c2659b703fdea3ad332877708d8_.html", - "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html", - "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html", - "ef7ee7e929a048c4a119af78492095b3_.html", - "e4251896a982c2b2b68678b5c9c57f4d_.html", - "5384764a16fab767ebcbc17d87758a24_.html", - "a6ba75a218ef2a09d189cb7dffcecc0f_.html", - "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html", - "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html", - "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html", - "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html", - "ddf78b1fe5c268d59fd62bc745815b92_.html", - "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html", - "8f984e2d3c2ba68d1695288f1738deaf_Nutch.html", - "c2ef09a95a956207cea073a515172be2_FrontPage.html", - "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" }; + String[] crawledFiles = { + "c463a4381eb837f9f5d45978cfbde79e_.html", + "a974b8d74f7779ab6c6f90b9b279467e_.html", + "6bc6497314656a3129732efd708e9f96_.html", + "6e88c40abe26cad0a726102997aed048_.html", + "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html", + "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html", + "8540187d75b9cd405b8fa97d665f9f90_.html", + 
"e501bc976c8693b4d28a55b79c390a32_.html", + "6add662f9f5758b7d75eec5cfa1f340b_.html", + "d4f20df3c37033dc516067ee1f424e4e_.html", + "d7b8fa9a02cdc95546030d04be4a98f3_solr.html", + "3cbe876e3a8e7a397811de3bb6a945cd_.html", + "5b987dde0da79d7f2e3f22b46437f514_bot.html", + "3d742820d9a701a1f02e10d5bf5ae633_credits.html", + "693673f3c73d04a26276effdea69b7ee_downloads.html", + "4f7e3469dafabb4c3b87b00531f81aa4_index.html", + "15c5330675be8a69995aab18ff9859e0_javadoc.html", + "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html", + "a7d66b68754c3665c66e62225255e3fd_version_control.html", + "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel", + "54ab3db10fe7b26415a04e21045125a8_1zE.html", + "1012a41c08092c40340598bd8ee0bfa6_PGa.html", + "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3", + "687d915dc264a77f35c61ba841936730_oHY.html", + "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html", + "550cab79e14110bbee61c36c61c830b0_1pbE15n.html", + "664ff07b46520cc1414494ae49da91f6_.html", + "04223714e648a6a43d7c8af8b095f733_.html", + "3c8ccb865cd72cca06635d74c7f2f3c4_.html", + "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html", + "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html", + "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html", + "78d04611985e7375b441e478fa36f610_.html", + "64adaebadd44e487a8b58894e979dc70_CHANGES.txt", + "a48e9c2659b703fdea3ad332877708d8_.html", + "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html", + "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html", + "ef7ee7e929a048c4a119af78492095b3_.html", + "e4251896a982c2b2b68678b5c9c57f4d_.html", + "5384764a16fab767ebcbc17d87758a24_.html", + "a6ba75a218ef2a09d189cb7dffcecc0f_.html", + "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html", + "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html", + "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html", + "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html", + "ddf78b1fe5c268d59fd62bc745815b92_.html", + "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html", + 
"8f984e2d3c2ba68d1695288f1738deaf_Nutch.html", + "c2ef09a95a956207cea073a515172be2_FrontPage.html", + "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" }; - CommonCrawlDataDumper dumper = new CommonCrawlDataDumper( - new CommonCrawlConfig()); - dumper.dump(tempDir, sampleSegmentDir, false, null, false, "", false); + CommonCrawlDataDumper dumper = new CommonCrawlDataDumper( + new CommonCrawlConfig()); + dumper.dump(tempDir, sampleSegmentDir, null, false, null, false, "", false); - Collection<File> tempFiles = FileUtils.listFiles(tempDir, - FileFilterUtils.fileFileFilter(), - FileFilterUtils.directoryFileFilter()); + Collection<File> tempFiles = FileUtils.listFiles(tempDir, + FileFilterUtils.fileFileFilter(), + FileFilterUtils.directoryFileFilter()); - for (String expectedFileName : crawledFiles) { - assertTrue("Missed file " + expectedFileName + " in dump", - hasFile(expectedFileName, tempFiles)); - } + for (String expectedFileName : crawledFiles) { + assertTrue("Missed file " + expectedFileName + " in dump", + hasFile(expectedFileName, tempFiles)); + } - } + } - private boolean hasFile(String fileName, Collection<File> files) { - for (File f : files) { - if (f.getName().equals(fileName)) { - return true; - } - } - return false; - } + private boolean hasFile(String fileName, Collection<File> files) { + for (File f : files) { + if (f.getName().equals(fileName)) { + return true; + } + } + return false; + } }
