Repository: nutch Updated Branches: refs/heads/master 76aedcb78 -> 9a9c4b32b
NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are ill-formed Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9a9c4b32 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9a9c4b32 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9a9c4b32 Branch: refs/heads/master Commit: 9a9c4b32b9c1ab9c47583a217665e4694272d58a Parents: 76aedcb Author: Markus Jelsma <[email protected]> Authored: Tue Feb 14 14:15:32 2017 +0100 Committer: Markus Jelsma <[email protected]> Committed: Tue Feb 14 14:15:32 2017 +0100 ---------------------------------------------------------------------- src/plugin/parsefilter-regex/README.txt | 41 ++++++++++++++++++++ .../parsefilter/regex/RegexParseFilter.java | 18 +++++---- 2 files changed, 52 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/README.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/README.txt b/src/plugin/parsefilter-regex/README.txt new file mode 100644 index 0000000..1fac05f --- /dev/null +++ b/src/plugin/parsefilter-regex/README.txt @@ -0,0 +1,41 @@ +Parsefilter-regex plugin + +Allow parsing and set custom defined fields using regex. Rules can be defined +in a separate rule file or in the nutch configuration. + +If a rule file is used, should create a text file regex-parsefilter.txt (which +is the default name of the rules file). To use a different filename, either +update the file value in plugin's build.xml or add parsefilter.regex.file +config to the nutch config. 
+ +ie: + <property> + <name>parsefilter.regex.file</name> + <value> + /path/to/rulefile + </value> + </property + + +Format of rules: <name>\t<source>\t<regex>\n + +ie: + my_first_field html h1 + my_second_field text my_pattern + + +If a rule file is not used, rules can be directly set in the nutch config: + +ie: + <property> + <name>parsefilter.regex.rules</name> + <value> + my_first_field html h1 + my_second_field text my_pattern + </value> + </property + +source can be either html or text. If source is html, the regex is applied to +the entire HTML tree. If source is text, the regex is applied to the +extracted text. + http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index 6955166..f799e5f 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -179,13 +179,17 @@ public class RegexParseFilter implements HtmlParseFilter { while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { line = line.trim(); - String[] parts = line.split("\t"); - - String field = parts[0].trim(); - String source = parts[1].trim(); - String regex = parts[2].trim(); - - rules.put(field, new RegexRule(source, regex)); + String[] parts = line.split("\\s"); + + if (parts.length == 3) { + String field = parts[0].trim(); + String source = parts[1].trim(); + String regex = parts[2].trim(); + + rules.put(field, new RegexRule(source, regex)); + } else { + LOG.info("RegexParseFilter rule is invalid. " + line); + } } } }
