Author: markus Date: Tue Feb 23 12:58:54 2016 New Revision: 1731849 URL: http://svn.apache.org/viewvc?rev=1731849&view=rev Log: NUTCH-2227 RegexParseFilter
Added: nutch/trunk/conf/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ nutch/trunk/src/plugin/parsefilter-regex/build.xml nutch/trunk/src/plugin/parsefilter-regex/data/ nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ivy.xml nutch/trunk/src/plugin/parsefilter-regex/plugin.xml nutch/trunk/src/plugin/parsefilter-regex/src/ nutch/trunk/src/plugin/parsefilter-regex/src/java/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java nutch/trunk/src/plugin/parsefilter-regex/src/test/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731849&r1=1731848&r2=1731849&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 12:58:54 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2227 RegexParseFilter (markus) + * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1731849&r1=1731848&r2=1731849&view=diff ============================================================================== --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Tue Feb 23 12:58:54 2016 @@ -200,6 +200,7 @@ <packageset dir="${plugins.dir}/parse-swf/src/java"/> <packageset dir="${plugins.dir}/parse-tika/src/java"/> <packageset dir="${plugins.dir}/parse-zip/src/java"/> + <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> <packageset dir="${plugins.dir}/protocol-http/src/java"/> @@ -637,6 +638,7 @@ <packageset dir="${plugins.dir}/parse-metatags/src/java"/> <packageset dir="${plugins.dir}/parse-swf/src/java"/> <packageset dir="${plugins.dir}/parse-tika/src/java"/> + <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/> <packageset dir="${plugins.dir}/parse-zip/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> @@ -1048,6 +1050,8 @@ <source path="${plugins.dir}/parse-tika/src/test/" /> <source path="${plugins.dir}/parse-zip/src/java/" /> <source path="${plugins.dir}/parse-zip/src/test/" /> + <source path="${plugins.dir}/parsefilter-regex/src/java/" /> + <source path="${plugins.dir}/parsefilter-regex/src/test/" /> <source path="${plugins.dir}/protocol-file/src/java/" /> <source path="${plugins.dir}/protocol-file/src/test/" /> <source path="${plugins.dir}/protocol-ftp/src/java/" /> Added: nutch/trunk/conf/regex-parsefilter.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-parsefilter.txt?rev=1731849&view=auto ============================================================================== --- nutch/trunk/conf/regex-parsefilter.txt (added) +++ nutch/trunk/conf/regex-parsefilter.txt Tue Feb 23 12:58:54 2016 @@ -0,0 +1,8 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field <name> is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. +# +# format: <name>\t<source>\t<regex>\n Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1731849&r1=1731848&r2=1731849&view=diff ============================================================================== --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Tue Feb 23 12:58:54 2016 @@ -143,6 +143,7 @@ plugins.parse=\ plugins.parsefilter=\ org.apache.nutch.parse.headings*:\ org.apache.nutch.parsefilter.naivebayes*:\ + org.apache.nutch.parsefilter.regex*:\ org.apache.nutch.parse.metatags* # Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1731849&r1=1731848&r2=1731849&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Tue Feb 23 12:58:54 2016 @@ -77,6 +77,7 @@ <ant dir="urlfilter-suffix" target="deploy"/> <ant dir="urlfilter-validator" target="deploy"/> <ant dir="parsefilter-naivebayes" target="deploy"/> + <ant dir="parsefilter-regex" target="deploy"/> <ant dir="urlmeta" target="deploy"/> <ant dir="urlnormalizer-ajax" target="deploy"/> <ant dir="urlnormalizer-basic" target="deploy"/> @@ -114,6 +115,7 @@ <ant dir="parse-swf" target="test"/> <ant dir="parse-tika" target="test"/> <ant dir="parse-zip" target="test"/> + <ant dir="parsefilter-regex" target="test"/> <ant dir="subcollection" target="test"/> <ant dir="urlfilter-automaton" target="test"/> <ant dir="urlfilter-domain" target="test"/> @@ -176,6 +178,7 @@ <ant dir="parse-swf" target="clean"/> <ant dir="parse-tika" target="clean"/> <ant dir="parse-zip" target="clean"/> + <ant dir="parsefilter-regex" target="clean"/> <ant dir="scoring-depth" target="clean"/> <ant dir="scoring-opic" target="clean"/> <ant dir="scoring-link" target="clean"/> Added: nutch/trunk/src/plugin/parsefilter-regex/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/build.xml?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/build.xml (added) +++ nutch/trunk/src/plugin/parsefilter-regex/build.xml Tue Feb 23 12:58:54 2016 @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parsefilter-regex" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> Added: nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt (added) +++ nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt Tue Feb 23 12:58:54 2016 @@ -0,0 +1,10 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field <name> is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. +# +# format: <name>\t<source>\t<regex>\n +first html h1 +second text blablabla Added: nutch/trunk/src/plugin/parsefilter-regex/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/ivy.xml?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/ivy.xml (added) +++ nutch/trunk/src/plugin/parsefilter-regex/ivy.xml Tue Feb 23 12:58:54 2016 @@ -0,0 +1,37 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> +</ivy-module> Added: nutch/trunk/src/plugin/parsefilter-regex/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/plugin.xml?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/plugin.xml (added) +++ nutch/trunk/src/plugin/parsefilter-regex/plugin.xml Tue Feb 23 12:58:54 2016 @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parsefilter-regex" + name="Regex Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parsefilter-regex.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.htmlparsefilter.regex" + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="RegexParseFilter" + class="org.apache.nutch.parsefilter.regex.RegexParseFilter"> + <parameter name="file" value="regex-parsefilter.txt"/> + </implementation> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java (added) +++ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java Tue Feb 23 12:58:54 2016 @@ -0,0 +1,199 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parsefilter.regex; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.FileReader; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.protocol.Content; + +import org.apache.commons.lang.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.w3c.dom.*; + +/** + * RegexParseFilter. If a regular expression matches either HTML or + * extracted text, a configurable field is set to true. + */ +public class RegexParseFilter implements HtmlParseFilter { + + private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class); + private static String attributeFile = null; + private String regexFile = null; + + private Configuration conf; + private DocumentFragment doc; + + private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>(); + + public RegexParseFilter() {} + + public RegexParseFilter(String regexFile) { + this.regexFile = regexFile; + } + + public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + Parse parse = parseResult.get(content.getUrl()); + String html = new String(content.getContent()); + String text = parse.getText(); + + for (Map.Entry<String, RegexRule> entry : rules.entrySet()) { + String field = entry.getKey(); + RegexRule regexRule = entry.getValue(); + + String source = null; + if (regexRule.source.equalsIgnoreCase("html")) { + source = html; + } + if (regexRule.source.equalsIgnoreCase("text")) { + source = text; + } + + if (source == null) { + LOG.error("source for regex rule: " + field + " misconfigured"); + } + + if (matches(source, regexRule.regex)) { + parse.getData().getParseMeta().set(field, "true"); + } else { + parse.getData().getParseMeta().set(field, "false"); + } + } + + return parseResult; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "parsefilter-regex"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + HtmlParseFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // handle blank non empty input + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } + else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // domain file and attribute "file" take precedence if defined + String file = conf.get("parsefilter.regex.file"); + String stringRules = conf.get("parsefilter.regex.rules"); + if (regexFile != null) { + file = regexFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public Configuration getConf() { + return this.conf; + } + + private boolean matches(String value, Pattern pattern) { + if (value != null) { + Matcher matcher = pattern.matcher(value); + return matcher.find(); + } + + return false; + } + + private synchronized void readConfiguration(Reader configReader) throws IOException { + if (rules.size() > 0) { + return; + } + + String line; + BufferedReader reader = new BufferedReader(configReader); + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line = line.trim(); + String[] parts = line.split("\t"); + + String field = parts[0].trim(); + String source = parts[1].trim(); + String regex = parts[2].trim(); + + rules.put(field, new RegexRule(source, regex)); + } + } + } + + private static class RegexRule { + public RegexRule(String source, String regex) { + this.source = source; + this.regex = Pattern.compile(regex); + } + String source; + Pattern regex; + } +} \ No newline at end of file Added: nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java (added) +++ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java Tue Feb 23 12:58:54 2016 @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * RegexParseFilter. If a regular expression matches either HTML or + * extracted text, a configurable field is set to true. + */ +package org.apache.nutch.parsefilter.regex; + Added: nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java?rev=1731849&view=auto ============================================================================== --- nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java (added) +++ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java Tue Feb 23 12:58:54 2016 @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parsefilter.regex; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import junit.framework.TestCase; + +public class TestRegexParseFilter extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testPositiveFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>"; + Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("true", meta.get("first")); + assertEquals("true", meta.get("second")); + } + + public void testNegativeFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>"; + Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("false", meta.get("first")); + assertEquals("false", meta.get("second")); + } +} \ No newline at end of file