[nutch] 01/03: NUTCH-2435 - New parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not.

snagel Thu, 19 Oct 2017 14:28:30 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


commit 7da8c704d1ea6dc892e32c8a9f9678affd1c7085
Author: Marcos Bori <[email protected]>
AuthorDate: Wed Sep 27 13:10:24 2017 +0200

    NUTCH-2435 - New parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not.
---
 conf/nutch-default.xml                             |  7 +++++++
 .../org/apache/nutch/parse/ParseOutputFormat.java  | 24 +++++++++++++++-------
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6ddf964..ed0bb98 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1379,6 +1379,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>parser.store.text</name>
+  <value>true</value>
+  <description>If true (default value), parser will store parse text (parse_text directory within the segment).</description>
+</property>
+
+
 <!--
 <property>
   <name>tika.htmlmapper.classname</name>
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index 6e84b12..b0778f3 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -111,6 +111,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         "db.ignore.external.links", false);
     final String ignoreExternalLinksMode = job.get(
         "db.ignore.external.links.mode", "byHost");
+    //NUTCH-2435 - parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not:
+    final boolean storeText = job.getBoolean(
+        "parser.store.text", true);
     
     int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
     final boolean isParsing = job.getBoolean("fetcher.parse", true);
@@ -128,13 +131,18 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         .split(" *, *");
 
     // textOut Options
-    Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
-    org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
-    org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
-    org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
+    final MapFile.Writer textOut;
+    if (storeText) {
+      Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+      org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
+      org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
+      org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
     
-    final MapFile.Writer textOut = new MapFile.Writer(job, text,
+      textOut = new MapFile.Writer(job, text,
         tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+    } else {
+      textOut=null;
+    }
     
     // dataOut Options
     Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
@@ -162,7 +170,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         String fromUrl = key.toString();
         // host or domain name of the source URL
         String origin = null;
-        textOut.append(key, new ParseText(parse.getText()));
+        if (textOut!=null) {
+          textOut.append(key, new ParseText(parse.getText()));
+        }
 
         ParseData parseData = parse.getData();
         // recover the signature prepared by Fetcher or ParseSegment
@@ -311,7 +321,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
       }
 
       public void close(Reporter reporter) throws IOException {
-        textOut.close();
+        if (textOut!=null) textOut.close();
         dataOut.close();
         crawlOut.close();
       }

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

[nutch] 01/03: NUTCH-2435 - New parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not.

Reply via email to