Author: markus
Date: Tue Feb 23 12:58:54 2016
New Revision: 1731849

URL: http://svn.apache.org/viewvc?rev=1731849&view=rev
Log:
NUTCH-2227 RegexParseFilter

Added:
    nutch/trunk/conf/regex-parsefilter.txt
    nutch/trunk/src/plugin/parsefilter-regex/
    nutch/trunk/src/plugin/parsefilter-regex/build.xml
    nutch/trunk/src/plugin/parsefilter-regex/data/
    nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
    nutch/trunk/src/plugin/parsefilter-regex/ivy.xml
    nutch/trunk/src/plugin/parsefilter-regex/plugin.xml
    nutch/trunk/src/plugin/parsefilter-regex/src/
    nutch/trunk/src/plugin/parsefilter-regex/src/java/
    nutch/trunk/src/plugin/parsefilter-regex/src/java/org/
    nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/
    nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/
    
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/
    
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/
    
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
    
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
    nutch/trunk/src/plugin/parsefilter-regex/src/test/
    nutch/trunk/src/plugin/parsefilter-regex/src/test/org/
    nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/
    nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/
    
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/
    
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/
    
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/default.properties
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731849&r1=1731848&r2=1731849&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 23 12:58:54 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
 
 Nutch Change Log
 
+* NUTCH-2227 RegexParseFilter (markus)
+
 * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus)
 
 * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus)

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1731849&r1=1731848&r2=1731849&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Feb 23 12:58:54 2016
@@ -200,6 +200,7 @@
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
@@ -637,6 +638,7 @@
       <packageset dir="${plugins.dir}/parse-metatags/src/java"/>
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
@@ -1048,6 +1050,8 @@
         <source path="${plugins.dir}/parse-tika/src/test/" />
         <source path="${plugins.dir}/parse-zip/src/java/" />
         <source path="${plugins.dir}/parse-zip/src/test/" />
+        <source path="${plugins.dir}/parsefilter-regex/src/java/" />
+        <source path="${plugins.dir}/parsefilter-regex/src/test/" />    
         <source path="${plugins.dir}/protocol-file/src/java/" />
         <source path="${plugins.dir}/protocol-file/src/test/" />
         <source path="${plugins.dir}/protocol-ftp/src/java/" />

Added: nutch/trunk/conf/regex-parsefilter.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-parsefilter.txt?rev=1731849&view=auto
==============================================================================
--- nutch/trunk/conf/regex-parsefilter.txt (added)
+++ nutch/trunk/conf/regex-parsefilter.txt Tue Feb 23 12:58:54 2016
@@ -0,0 +1,8 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to "true" if the regex matches the chosen
+# source and to "false" otherwise. The source can either be html or text. If
+# source is html, the regex is applied to the raw HTML content. If source is
+# text, the regex is applied to the extracted text.
+#
+# format: <name>\t<source>\t<regex>\n

Modified: nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1731849&r1=1731848&r2=1731849&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Tue Feb 23 12:58:54 2016
@@ -143,6 +143,7 @@ plugins.parse=\
 plugins.parsefilter=\
    org.apache.nutch.parse.headings*:\
    org.apache.nutch.parsefilter.naivebayes*:\
+   org.apache.nutch.parsefilter.regex*:\
    org.apache.nutch.parse.metatags*
 
 #

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1731849&r1=1731848&r2=1731849&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Tue Feb 23 12:58:54 2016
@@ -77,6 +77,7 @@
      <ant dir="urlfilter-suffix" target="deploy"/>
      <ant dir="urlfilter-validator" target="deploy"/>
      <ant dir="parsefilter-naivebayes" target="deploy"/>
+     <ant dir="parsefilter-regex" target="deploy"/>
      <ant dir="urlmeta" target="deploy"/>
      <ant dir="urlnormalizer-ajax" target="deploy"/>
      <ant dir="urlnormalizer-basic" target="deploy"/>
@@ -114,6 +115,7 @@
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="parse-zip" target="test"/>
+     <ant dir="parsefilter-regex" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test"/>
@@ -176,6 +178,7 @@
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
+    <ant dir="parsefilter-regex" target="clean"/>
     <ant dir="scoring-depth" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
     <ant dir="scoring-link" target="clean"/>

Added: nutch/trunk/src/plugin/parsefilter-regex/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/build.xml?rev=1731849&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-regex/build.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-regex/build.xml Tue Feb 23 12:58:54 2016
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

Added: nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt?rev=1731849&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt (added)
+++ nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt Tue Feb 
23 12:58:54 2016
@@ -0,0 +1,10 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to "true" if the regex matches the chosen
+# source and to "false" otherwise. The source can either be html or text. If
+# source is html, the regex is applied to the raw HTML content. If source is
+# text, the regex is applied to the extracted text.
+#
+# format: <name>\t<source>\t<regex>\n
+first	html	h1
+second	text	blablabla

Added: nutch/trunk/src/plugin/parsefilter-regex/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/ivy.xml?rev=1731849&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-regex/ivy.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-regex/ivy.xml Tue Feb 23 12:58:54 2016
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>  
+</ivy-module>

Added: nutch/trunk/src/plugin/parsefilter-regex/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/plugin.xml?rev=1731849&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-regex/plugin.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-regex/plugin.xml Tue Feb 23 12:58:54 2016
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-regex"
+   name="Regex Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.regex"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RegexParseFilter" 
+                      class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
+          <parameter name="file" value="regex-parsefilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

Added: 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java?rev=1731849&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
 (added)
+++ 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
 Tue Feb 23 12:58:54 2016
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.regex;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.FileReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.w3c.dom.*;
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or 
+ * extracted text, a configurable field is set to true.
+ */
+public class RegexParseFilter implements HtmlParseFilter {
+  
+  private static final Logger LOG = 
LoggerFactory.getLogger(RegexParseFilter.class);
+  private static String attributeFile = null;
+  private String regexFile = null;
+  
+  private Configuration conf;
+  private DocumentFragment doc;
+  
+  private static final Map<String,RegexRule> rules = new 
HashMap<String,RegexRule>();
+  
+  public RegexParseFilter() {}
+  
+  public RegexParseFilter(String regexFile) {
+    this.regexFile = regexFile;
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+    String html = new String(content.getContent());
+    String text = parse.getText();
+    
+    for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
+      String field = entry.getKey();
+      RegexRule regexRule = entry.getValue();
+      
+      String source = null;
+      if (regexRule.source.equalsIgnoreCase("html")) {
+        source = html;
+      }
+      if (regexRule.source.equalsIgnoreCase("text")) {
+        source = text;
+      }
+      
+      if (source == null) {
+        LOG.error("source for regex rule: " + field + " misconfigured");
+      }
+      
+      if (matches(source, regexRule.regex)) {
+        parse.getData().getParseMeta().set(field, "true");
+      } else {
+        parse.getData().getParseMeta().set(field, "false");
+      }
+    }
+    
+    return parseResult;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "parsefilter-regex";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      HtmlParseFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("parsefilter.regex.file");
+    String stringRules = conf.get("parsefilter.regex.rules");
+    if (regexFile != null) {
+      file = regexFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  private boolean matches(String value, Pattern pattern) {
+    if (value != null) {
+      Matcher matcher = pattern.matcher(value);
+      return matcher.find();
+    }
+       
+    return false;
+  }
+  
+  private synchronized void readConfiguration(Reader configReader) throws 
IOException {
+    if (rules.size() > 0) {
+      return;
+    }
+
+    String line;
+    BufferedReader reader = new BufferedReader(configReader);
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        line = line.trim();
+        String[] parts = line.split("\t");
+
+        String field = parts[0].trim();
+        String source = parts[1].trim();
+        String regex = parts[2].trim();
+        
+        rules.put(field, new RegexRule(source, regex));
+      }
+    }
+  }
+  
+  private static class RegexRule {
+    public RegexRule(String source, String regex) {
+      this.source = source;
+      this.regex = Pattern.compile(regex);
+    }
+    String source;
+    Pattern regex;
+  }
+}
\ No newline at end of file

Added: 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java?rev=1731849&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
 (added)
+++ 
nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
 Tue Feb 23 12:58:54 2016
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or 
+ * extracted text, a configurable field is set to true.
+ */
+package org.apache.nutch.parsefilter.regex;
+

Added: 
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java?rev=1731849&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
 (added)
+++ 
nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
 Tue Feb 23 12:58:54 2016
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.regex;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import junit.framework.TestCase;
+
+/**
+ * Tests RegexParseFilter against the rules in regex-parsefilter.txt: field
+ * "first" looks for h1 in the HTML, field "second" looks for blablabla in
+ * the extracted text.
+ */
+public class TestRegexParseFilter extends TestCase {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+  private final static String URL = "http://nutch.apache.org/";
+
+  /** Both rules match, so both fields must be "true". */
+  public void testPositiveFilter() throws Exception {
+    Metadata meta = runFilter(
+        "<body><html><h1>nutch</h1><p>this is the extracted text "
+            + "blablabla</p></body></html>",
+        "nutch this is the extracted text blablabla");
+
+    assertEquals("true", meta.get("first"));
+    assertEquals("true", meta.get("second"));
+  }
+
+  /** Neither rule matches, so both fields must be "false". */
+  public void testNegativeFilter() throws Exception {
+    Metadata meta = runFilter(
+        "<body><html><h2>nutch</h2><p>this is the extracted text no "
+            + "bla</p></body></html>",
+        "nutch this is the extracted text bla");
+
+    assertEquals("false", meta.get("first"));
+    assertEquals("false", meta.get("second"));
+  }
+
+  /**
+   * Runs a filter configured from the sample rule file over one document.
+   *
+   * @param html raw content of the document
+   * @param text extracted text of the document
+   * @return the parse metadata after filtering
+   */
+  private Metadata runFilter(String html, String text) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
+    RegexParseFilter filter = new RegexParseFilter(file);
+    filter.setConf(conf);
+
+    Content content = new Content(URL, URL, html.getBytes("UTF-8"),
+        "text/html", new Metadata(), conf);
+    Parse parse = new ParseImpl(text, new ParseData());
+
+    ParseResult result = ParseResult.createParseResult(URL, parse);
+    filter.filter(content, result, null, null);
+
+    return parse.getData().getParseMeta();
+  }
+}
\ No newline at end of file


Reply via email to