This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new caea3a0 NUTCH-2772 Debugging parse filter to show serialized DOM tree new 3665345 Merge pull request #500 from sebastian-nagel/NUTCH-2772-parsefilter-debug caea3a0 is described below commit caea3a051aceb947d17ccfaa080f6bd864802a4d Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Thu Feb 27 17:14:06 2020 +0100 NUTCH-2772 Debugging parse filter to show serialized DOM tree --- build.xml | 3 + default.properties | 1 + src/java/org/apache/nutch/util/DomUtil.java | 24 +++++--- src/plugin/build.xml | 2 + src/plugin/parsefilter-debug/build.xml | 22 +++++++ src/plugin/parsefilter-debug/ivy.xml | 37 ++++++++++++ src/plugin/parsefilter-debug/plugin.xml | 41 +++++++++++++ .../nutch/parsefilter/debug/DebugParseFilter.java | 68 ++++++++++++++++++++++ .../nutch/parsefilter/debug/package-info.java | 23 ++++++++ 9 files changed, 213 insertions(+), 8 deletions(-) diff --git a/build.xml b/build.xml index ae0f111..b54e713 100644 --- a/build.xml +++ b/build.xml @@ -210,6 +210,7 @@ <packageset dir="${plugins.dir}/parse-swf/src/java"/> <packageset dir="${plugins.dir}/parse-tika/src/java"/> <packageset dir="${plugins.dir}/parse-zip/src/java"/> + <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/> <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/> <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> @@ -719,6 +720,7 @@ <packageset dir="${plugins.dir}/parse-swf/src/java"/> <packageset dir="${plugins.dir}/parse-tika/src/java"/> <packageset dir="${plugins.dir}/parse-zip/src/java"/> + <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/> <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/> <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> @@ -1131,6 +1133,7 @@ <source path="${plugins.dir}/parse-tika/src/test/" /> <source path="${plugins.dir}/parse-zip/src/java/" /> <source path="${plugins.dir}/parse-zip/src/test/" /> + <source path="${plugins.dir}/parsefilter-debug/src/java/" /> <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" /> <source path="${plugins.dir}/parsefilter-regex/src/java/" /> <source path="${plugins.dir}/parsefilter-regex/src/test/" /> diff --git a/default.properties b/default.properties index 668f938..1537a01 100644 --- a/default.properties +++ b/default.properties @@ -153,6 +153,7 @@ plugins.parse=\ # Parse Filter Plugins # plugins.parsefilter=\ + org.apache.nutch.parsefilter.debug*:\ org.apache.nutch.parse.headings*:\ org.apache.nutch.parsefilter.naivebayes*:\ org.apache.nutch.parsefilter.regex*:\ diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java index 2461286..d0bfafd 100644 --- a/src/java/org/apache/nutch/util/DomUtil.java +++ b/src/java/org/apache/nutch/util/DomUtil.java @@ -22,7 +22,9 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; +import java.nio.charset.StandardCharsets; +import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; @@ -33,6 +35,7 @@ import javax.xml.transform.stream.StreamResult; import org.apache.xerces.parsers.DOMParser; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; +import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -91,16 +94,12 @@ public class DomUtil { try { transformer = transFactory.newTransformer(); transformer.setOutputProperty("indent", "yes"); + transformer.setOutputProperty(OutputKeys.ENCODING, + StandardCharsets.UTF_8.name()); StreamResult result = new StreamResult(os); transformer.transform(source, result); os.flush(); - } catch (UnsupportedEncodingException e1) { - LOG.error("Error: ", e1); - } catch (IOException e1) { - LOG.error("Error: ", e1); - } catch (TransformerConfigurationException e2) { - LOG.error("Error: ", e2); - } catch (TransformerException ex) { + } catch (IOException | TransformerException ex) { LOG.error("Error: ", ex); } } @@ -108,7 +107,16 @@ public class DomUtil { public static void saveDom(OutputStream os, DocumentFragment doc) { NodeList docChildren = doc.getChildNodes(); for (int i = 0; i < docChildren.getLength(); i++) { - saveDom(os, (Element) docChildren.item(i)); + Node child = docChildren.item(i); + if (child instanceof Element) { + saveDom(os, (Element) child); + } else { + try { + os.write(child.toString().getBytes(StandardCharsets.UTF_8)); + } catch (IOException ex) { + LOG.error("Error: ", ex); + } + } } } } diff --git a/src/plugin/build.xml b/src/plugin/build.xml index b0882a5..581a37a 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -68,6 +68,7 @@ <ant dir="parse-swf" target="deploy"/> <ant dir="parse-tika" target="deploy"/> <ant dir="parse-zip" target="deploy"/> + <ant dir="parsefilter-debug" target="deploy"/> <ant dir="parsefilter-naivebayes" target="deploy"/> <ant dir="parsefilter-regex" target="deploy"/> <ant dir="protocol-file" target="deploy"/> @@ -214,6 +215,7 @@ <ant dir="parse-swf" target="clean"/> <ant dir="parse-tika" target="clean"/> <ant dir="parse-zip" target="clean"/> + <ant dir="parsefilter-debug" target="clean" /> <ant dir="parsefilter-naivebayes" target="clean" /> <ant dir="parsefilter-regex" target="clean"/> <ant dir="protocol-file" target="clean"/> diff --git a/src/plugin/parsefilter-debug/build.xml b/src/plugin/parsefilter-debug/build.xml new file mode 100644 index 0000000..1f175e4 --- /dev/null +++ b/src/plugin/parsefilter-debug/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parsefilter-debug" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> diff --git a/src/plugin/parsefilter-debug/ivy.xml b/src/plugin/parsefilter-debug/ivy.xml new file mode 100644 index 0000000..dac80e6 --- /dev/null +++ b/src/plugin/parsefilter-debug/ivy.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> +</ivy-module> diff --git a/src/plugin/parsefilter-debug/plugin.xml b/src/plugin/parsefilter-debug/plugin.xml new file mode 100644 index 0000000..bc4a574 --- /dev/null +++ b/src/plugin/parsefilter-debug/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parsefilter-debug" + name="Debugging Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parsefilter-debug.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.htmlparsefilter.regex" + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="DebugParseFilter" + class="org.apache.nutch.parsefilter.debug.DebugParseFilter"> + </implementation> + </extension> + +</plugin> diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java new file mode 100644 index 0000000..691f894 --- /dev/null +++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parsefilter.debug; + +import java.io.ByteArrayOutputStream; +import java.io.OutputStreamWriter; +import java.lang.invoke.MethodHandles; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.DomUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; + +/** + * Adds serialized DOM to parse data, useful for debugging, to understand how + * the parser implementation interprets a document (not only HTML). + */ +public class DebugParseFilter implements HtmlParseFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private Configuration conf; + + @Override + public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DomUtil.saveDom(baos, doc); + Parse parse = parseResult.get(content.getUrl()); + String dom = new String(baos.toByteArray(), StandardCharsets.UTF_8); + LOG.debug(dom); + parse.getData().getParseMeta().set("DOM", dom); + return parseResult; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Configuration getConf() { + return conf; + } +} diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java new file mode 100644 index 0000000..bbc24dd --- /dev/null +++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Adds serialized DOM to parse data, useful for debugging, to understand how + * the parser implementation interprets a document (not only HTML). + */ +package org.apache.nutch.parsefilter.debug; +