This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 7da8c704d1ea6dc892e32c8a9f9678affd1c7085 Author: Marcos Bori <[email protected]> AuthorDate: Wed Sep 27 13:10:24 2017 +0200 NUTCH-2435 - New parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not. --- conf/nutch-default.xml | 7 +++++++ .../org/apache/nutch/parse/ParseOutputFormat.java | 24 +++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 6ddf964..ed0bb98 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1379,6 +1379,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this </description> </property> +<property> + <name>parser.store.text</name> + <value>true</value> + <description>If true (default value), parser will store parse text (parse_text directory within the segment).</description> +</property> + + <!-- <property> <name>tika.htmlmapper.classname</name> diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java index 6e84b12..b0778f3 100644 --- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java +++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java @@ -111,6 +111,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { "db.ignore.external.links", false); final String ignoreExternalLinksMode = job.get( "db.ignore.external.links.mode", "byHost"); + //NUTCH-2435 - parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not: + final boolean storeText = job.getBoolean( + "parser.store.text", true); int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100); final boolean isParsing = job.getBoolean("fetcher.parse", true); @@ -128,13 +131,18 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { .split(" *, *"); // textOut Options - Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); - org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class); - org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress); - org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD); + final MapFile.Writer textOut; + if (storeText) { + Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class); + org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress); + org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD); - final MapFile.Writer textOut = new MapFile.Writer(job, text, + textOut = new MapFile.Writer(job, text, tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt); + } else { + textOut=null; + } // dataOut Options Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); @@ -162,7 +170,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { String fromUrl = key.toString(); // host or domain name of the source URL String origin = null; - textOut.append(key, new ParseText(parse.getText())); + if (textOut!=null) { + textOut.append(key, new ParseText(parse.getText())); + } ParseData parseData = parse.getData(); // recover the signature prepared by Fetcher or ParseSegment @@ -311,7 +321,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { } public void close(Reporter reporter) throws IOException { - textOut.close(); + if (textOut!=null) textOut.close(); dataOut.close(); crawlOut.close(); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
