Repository: nutch
Updated Branches:
  refs/heads/master 76aedcb78 -> 9a9c4b32b


NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are 
ill-formed


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9a9c4b32
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9a9c4b32
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9a9c4b32

Branch: refs/heads/master
Commit: 9a9c4b32b9c1ab9c47583a217665e4694272d58a
Parents: 76aedcb
Author: Markus Jelsma <[email protected]>
Authored: Tue Feb 14 14:15:32 2017 +0100
Committer: Markus Jelsma <[email protected]>
Committed: Tue Feb 14 14:15:32 2017 +0100

----------------------------------------------------------------------
 src/plugin/parsefilter-regex/README.txt         | 41 ++++++++++++++++++++
 .../parsefilter/regex/RegexParseFilter.java     | 18 +++++----
 2 files changed, 52 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/README.txt 
b/src/plugin/parsefilter-regex/README.txt
new file mode 100644
index 0000000..1fac05f
--- /dev/null
+++ b/src/plugin/parsefilter-regex/README.txt
@@ -0,0 +1,41 @@
+Parsefilter-regex plugin
+
+Allows parsing and setting custom-defined fields using regex. Rules can be
+defined in a separate rule file or in the nutch configuration.
+
+If a rule file is used, create a text file named regex-parsefilter.txt (which
+is the default name of the rules file). To use a different filename, either
+update the file value in the plugin’s build.xml or set parsefilter.regex.file
+in the nutch config.
+
+ie:
+    <property>
+      <name>parsefilter.regex.file</name>
+      <value>
+       /path/to/rulefile
+      </value>
+    </property>
+
+
+Format of rules: <name>\t<source>\t<regex>\n
+
+ie:
+       my_first_field          html    h1
+       my_second_field         text    my_pattern
+
+
+If a rule file is not used, rules can be directly set in the nutch config:
+
+ie:
+    <property>
+      <name>parsefilter.regex.rules</name>
+      <value>
+       my_first_field          html    h1
+       my_second_field         text    my_pattern
+      </value>
+    </property>
+
+source can be either html or text. If source is html, the regex is applied to
+the entire HTML tree. If source is text, the regex is applied to the
+extracted text.
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 6955166..f799e5f 100644
--- 
a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -179,13 +179,17 @@ public class RegexParseFilter implements HtmlParseFilter {
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
         line = line.trim();
-        String[] parts = line.split("\t");
-
-        String field = parts[0].trim();
-        String source = parts[1].trim();
-        String regex = parts[2].trim();
-        
-        rules.put(field, new RegexRule(source, regex));
+        String[] parts = line.split("\\s");
+
+        if (parts.length == 3) {
+            String field = parts[0].trim();
+            String source = parts[1].trim();
+            String regex = parts[2].trim();
+            
+            rules.put(field, new RegexRule(source, regex));
+        } else {
+            LOG.info("RegexParseFilter rule is invalid. " + line);
+        }
       }
     }
   }

Reply via email to