http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-link/plugin.xml b/nutch-plugins/scoring-link/plugin.xml new file mode 100644 index 0000000..2b1c1e1 --- /dev/null +++ b/nutch-plugins/scoring-link/plugin.xml @@ -0,0 +1,39 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="scoring-link" + name="Link Analysis Scoring Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="scoring-link.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.scoring.link" + name="LinkAnalysisScoring" + point="org.apache.nutch.scoring.ScoringFilter"> + + <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" + class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" /> + </extension> + +</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-link/pom.xml b/nutch-plugins/scoring-link/pom.xml new file mode 100644 index 0000000..3c7041e --- /dev/null +++ b/nutch-plugins/scoring-link/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>scoring-link</artifactId> + <packaging>jar</packaging> + + <name>scoring-link</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java new file mode 100644 index 0000000..a143f46 --- /dev/null +++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.scoring.link; + +import java.util.Collection; +import java.util.List; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; + +public class LinkAnalysisScoringFilter implements ScoringFilter { + + private Configuration conf; + private float normalizedScore = 1.00f; + + public LinkAnalysisScoringFilter() { + + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f); + } + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + return adjust; + } + + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return datum.getScore() * initSort; + } + + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return 
(normalizedScore * dbDatum.getScore()); + } + + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + datum.setScore(0.0f); + } + + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + } + + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException { + parse.getData().getContentMeta() + .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); + } + + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) + throws ScoringFilterException { + content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); + } + + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { + // nothing to do + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java new file mode 100644 index 0000000..9dc0c35 --- /dev/null +++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Scoring filter used in conjunction with + * {@link org.apache.nutch.scoring.webgraph.WebGraph}. + */ +package org.apache.nutch.scoring.link; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/build.xml b/nutch-plugins/scoring-opic/build.xml new file mode 100644 index 0000000..137dab4 --- /dev/null +++ b/nutch-plugins/scoring-opic/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="scoring-opic" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/ivy.xml b/nutch-plugins/scoring-opic/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/scoring-opic/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/plugin.xml b/nutch-plugins/scoring-opic/plugin.xml new file mode 100644 index 0000000..3805a31 --- /dev/null +++ b/nutch-plugins/scoring-opic/plugin.xml @@ -0,0 +1,39 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="scoring-opic" + name="OPIC Scoring Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="scoring-opic.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.scoring.opic" + name="OPICScoring" + point="org.apache.nutch.scoring.ScoringFilter"> + + <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter" + class="org.apache.nutch.scoring.opic.OPICScoringFilter" /> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/pom.xml b/nutch-plugins/scoring-opic/pom.xml new file mode 100644 index 0000000..58e0786 --- /dev/null +++ b/nutch-plugins/scoring-opic/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>scoring-opic</artifactId> + <packaging>jar</packaging> + + <name>scoring-opic</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java new file mode 100644 index 0000000..e943d06 --- /dev/null +++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -0,0 +1,173 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.scoring.opic; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; +import java.util.List; +import java.util.Map.Entry; + +// Slf4j Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; + +/** + * This plugin implements a variant of an Online Page Importance Computation + * (OPIC) score, described in this paper: <a + * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"> + * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive + * On-Line Page Importance Computation </a>. 
+ * + * @author Andrzej Bialecki + */ +public class OPICScoringFilter implements ScoringFilter { + + private final static Logger LOG = LoggerFactory + .getLogger(OPICScoringFilter.class); + + private Configuration conf; + private float scoreInjected; + private float scorePower; + private float internalScoreFactor; + private float externalScoreFactor; + private boolean countFiltered; + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + scorePower = conf.getFloat("indexer.score.power", 0.5f); + internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f); + externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f); + countFiltered = conf.getBoolean("db.score.count.filtered", false); + } + + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + } + + /** + * Set to 0.0f (unknown value) - inlink contributions will bring it to a + * correct level. Newly discovered pages have at least one inlink. + */ + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + datum.setScore(0.0f); + } + + /** Use {@link CrawlDatum#getScore()}. */ + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return datum.getScore() * initSort; + } + + /** Increase the score by a sum of inlinked scores. */ + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { + float adjust = 0.0f; + for (int i = 0; i < inlinked.size(); i++) { + CrawlDatum linked = inlinked.get(i); + adjust += linked.getScore(); + } + if (old == null) + old = datum; + datum.setScore(old.getScore() + adjust); + } + + /** Store a float value of CrawlDatum.getScore() under Nutch.SCORE_KEY. 
*/ + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { + content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); + } + + /** Copy the value from Content metadata under Nutch.SCORE_KEY to parseData. */ + public void passScoreAfterParsing(Text url, Content content, Parse parse) { + parse.getData().getContentMeta() + .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); + } + + /** + * Get a float value from Nutch.SCORE_KEY, divide it by the number of + * outlinks and apply. + */ + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + float score = scoreInjected; + String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY); + if (scoreString != null) { + try { + score = Float.parseFloat(scoreString); + } catch (Exception e) { + LOG.error("Error: ", e); + } + } + int validCount = targets.size(); + if (countFiltered) { + score /= allCount; + } else { + if (validCount == 0) { + // no outlinks to distribute score, so just return adjust + return adjust; + } + score /= validCount; + } + // internal and external score factor + float internalScore = score * internalScoreFactor; + float externalScore = score * externalScoreFactor; + for (Entry<Text, CrawlDatum> target : targets) { + try { + String toHost = new URL(target.getKey().toString()).getHost(); + String fromHost = new URL(fromUrl.toString()).getHost(); + if (toHost.equalsIgnoreCase(fromHost)) { + target.getValue().setScore(internalScore); + } else { + target.getValue().setScore(externalScore); + } + } catch (MalformedURLException e) { + LOG.error("Error: ", e); + target.getValue().setScore(externalScore); + } + } + // XXX (ab) no adjustment? I think this is contrary to the algorithm descr. + // XXX in the paper, where page "loses" its score if it's distributed to + // XXX linked pages... 
+ return adjust; + } + + /** Dampen the boost value by scorePower. */ + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java new file mode 100644 index 0000000..26f6cbe --- /dev/null +++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Scoring filter implementing a variant of the Online Page Importance Computation + * (OPIC) algorithm. 
+ */ +package org.apache.nutch.scoring.opic; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build-ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/build-ivy.xml b/nutch-plugins/scoring-similarity/build-ivy.xml new file mode 100644 index 0000000..50fbb96 --- /dev/null +++ b/nutch-plugins/scoring-similarity/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/build.xml b/nutch-plugins/scoring-similarity/build.xml new file mode 100644 index 0000000..66ac8f3 --- /dev/null +++ b/nutch-plugins/scoring-similarity/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="scoring-similarity" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/ivy.xml b/nutch-plugins/scoring-similarity/ivy.xml new file mode 100644 index 0000000..be0a1de --- /dev/null +++ b/nutch-plugins/scoring-similarity/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/plugin.xml b/nutch-plugins/scoring-similarity/plugin.xml new file mode 100644 index 0000000..9639c18 --- /dev/null +++ b/nutch-plugins/scoring-similarity/plugin.xml @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="scoring-similarity" + name="Similarity based Scoring Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="scoring-similarity.jar"> + <export name="*"/> + </library> + <library name="lucene-analyzers-common-5.5.0.jar"/> + <library name="lucene-core-5.5.0.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.scoring.similarity" + name="SimilarityScoring" + point="org.apache.nutch.scoring.ScoringFilter"> + + <implementation id="scoring-similarity" + class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" /> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/pom.xml b/nutch-plugins/scoring-similarity/pom.xml new file mode 100644 index 0000000..b1f7cb7 --- /dev/null +++ b/nutch-plugins/scoring-similarity/pom.xml @@ -0,0 +1,45 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>scoring-similarity</artifactId> + <packaging>jar</packaging> + + <name>scoring-similarity</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>5.5.0</version> + </dependency> + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java new file mode 100644 index 0000000..f44fabd --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.scoring.similarity; + +import java.util.Collection; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; + +public interface SimilarityModel { + + public void setConf(Configuration conf); + + public float setURLScoreAfterParsing(Text url, Content content, Parse parse); + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java new file mode 100644 index 0000000..0f905b8 --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.scoring.similarity; + +import java.util.Collection; +import java.util.List; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.AbstractScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity; + +public class SimilarityScoringFilter extends AbstractScoringFilter { + + private Configuration conf; + private SimilarityModel similarityModel; + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + switch(conf.get("scoring.similarity.model","cosine")){ + case "cosine": + similarityModel = (SimilarityModel) new CosineSimilarity(); + break; + } + similarityModel.setConf(conf); + } + + @Override + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException { + + float score = similarityModel.setURLScoreAfterParsing(url, content, parse); + 
parse.getData().getContentMeta() + .set(Nutch.SCORE_KEY, score+""); + } + + @Override + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount); + return adjust; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java new file mode 100644 index 0000000..9853b34 --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.scoring.similarity.cosine; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.similarity.SimilarityModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class CosineSimilarity implements SimilarityModel{ + + private Configuration conf; + private final static Logger LOG = LoggerFactory + .getLogger(CosineSimilarity.class); + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public float setURLScoreAfterParsing(Text url, Content content, Parse parse) { + float score = 1; + + try { + if(!Model.isModelCreated){ + Model.createModel(conf); + } + String metatags = parse.getData().getParseMeta().get("metatag.keyword"); + String metaDescription = parse.getData().getParseMeta().get("metatag.description"); + int[] ngramArr = Model.retrieveNgrams(conf); + int mingram = ngramArr[0]; + int maxgram = ngramArr[1]; + DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram); + if(docVector!=null){ + score = Model.computeCosineSimilarity(docVector); + LOG.info("Setting score of {} to {}",url, score); + } + else { + throw new Exception("Could not create DocVector from parsed text"); + } + } catch (Exception e) { + LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e)); + } + return score; + } + + @Override + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, + Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, + int allCount) { + float score = 
Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY)); + for (Entry<Text, CrawlDatum> target : targets) { + target.getValue().setScore(score); + } + return adjust; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java new file mode 100644 index 0000000..33c3a23 --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.scoring.similarity.cosine; + +import java.util.HashMap; +import java.util.Map; + +public class DocVector { + + public HashMap<Integer, Long> termVector; + public HashMap<String, Integer> termFreqVector; + + public DocVector() { + termFreqVector = new HashMap<>(); + } + + public void setTermFreqVector(HashMap<String, Integer> termFreqVector) { + this.termFreqVector = termFreqVector; + } + + public void setVectorEntry(int pos, long freq) { + termVector.put(pos, freq); + } + + public float dotProduct(DocVector docVector) { + float product = 0; + for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) { + if(docVector.termFreqVector.containsKey(entry.getKey())) { + product += docVector.termFreqVector.get(entry.getKey())*entry.getValue(); + } + } + return product; + } + + public float getL2Norm() { + float sum = 0; + for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) { + sum += entry.getValue()*entry.getValue(); + } + return (float) Math.sqrt(sum); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java new file mode 100644 index 0000000..d8180f2 --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java @@ -0,0 +1,190 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.scoring.similarity.cosine; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.StringUtils; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; +import org.apache.nutch.scoring.similarity.util.LuceneTokenizer; +import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType; +import org.apache.tika.Tika; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class creates a model used to store Document vector representation of the corpus. 
+ * + */ +public class Model { + + //Currently only one file, but in future could accept a corpus hence an ArrayList + public static ArrayList<DocVector> docVectors = new ArrayList<>(); + private static final Logger LOG = LoggerFactory.getLogger(Model.class); + public static boolean isModelCreated = false; + private static List<String> stopWords; + + public static synchronized void createModel(Configuration conf) throws IOException { + if(isModelCreated) { + LOG.info("Model exists, skipping model creation"); + return; + } + LOG.info("Creating Cosine model"); + try { + //If user has specified a stopword file other than the template + if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) { + stopWords = new ArrayList<String>(); + String stopWord; + BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file")))); + while ((stopWord = br.readLine()) != null) { + stopWords.add(stopWord); + } + LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file")); + } + + int[] ngramArr = retrieveNgrams(conf); + int mingram = ngramArr[0]; + int maxgram = ngramArr[1]; + LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram); + + // TODO : Allow for corpus of documents to be provided as gold standard. 
+ String line; + StringBuilder sb = new StringBuilder(); + BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file")))); + while ((line = br.readLine()) != null) { + sb.append(line); + } + DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram); + if(goldStandard!=null) + docVectors.add(goldStandard); + else { + throw new Exception("Could not create DocVector for goldstandard"); + } + } catch (Exception e) { + LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), + StringUtils.stringifyException(e)); + } + if(docVectors.size()>0) { + LOG.info("Cosine model creation complete"); + isModelCreated = true; + } + else + LOG.info("Cosine model creation failed"); + } + + /** + * Used to create a DocVector from given String text. Used during the parse stage of the crawl + * cycle to create a DocVector of the currently parsed page from the parseText attribute value + * @param content The text to tokenize + * @param mingram Value of mingram for tokenizing + * @param maxgram Value of maxgram for tokenizing + */ + public static DocVector createDocVector(String content, int mingram, int maxgram) { + LuceneTokenizer tokenizer; + + if(mingram > 1 && maxgram > 1){ + LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); + } else if (mingram > 1) { + maxgram = mingram; + LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); + } + else if(stopWords!=null) { + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, + StemFilterType.PORTERSTEM_FILTER); + } + else { + tokenizer = new 
LuceneTokenizer(content, TokenizerType.STANDARD, true, + StemFilterType.PORTERSTEM_FILTER); + } + TokenStream tStream = tokenizer.getTokenStream(); + HashMap<String, Integer> termVector = new HashMap<>(); + try { + CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class); + tStream.reset(); + while(tStream.incrementToken()) { + String term = charTermAttribute.toString(); + LOG.debug(term); + if(termVector.containsKey(term)) { + int count = termVector.get(term); + count++; + termVector.put(term, count); + } + else { + termVector.put(term, 1); + } + } + DocVector docVector = new DocVector(); + docVector.setTermFreqVector(termVector); + return docVector; + } catch (IOException e) { + LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e)); + } + return null; + } + + public static float computeCosineSimilarity(DocVector docVector) { + float scores[] = new float[docVectors.size()]; + int i=0; + float maxScore = 0; + for(DocVector corpusDoc : docVectors) { + float numerator = docVector.dotProduct(corpusDoc); + float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm(); + float currentScore = numerator/denominator; + scores[i++] = currentScore; + maxScore = (currentScore>maxScore)? 
currentScore : maxScore; + } + // Returning the max score amongst all documents in the corpus + return maxScore; + } + + /** + * Retrieves mingram and maxgram from configuration + * @param conf Configuration to retrieve mingram and maxgram + * @return ngram array as mingram at first index and maxgram at second index + */ + public static int[] retrieveNgrams(Configuration conf){ + int[] ngramArr = new int[2]; + //Check if user has specified mingram or ngram for ngram cosine model + String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1"); + //mingram + ngramArr[0] = Integer.parseInt(ngramStr[0]); + int maxgram; + if (ngramStr.length > 1) { + //maxgram + ngramArr[1] = Integer.parseInt(ngramStr[1]); + } else { + //maxgram + ngramArr[1] = ngramArr[0]; + } + return ngramArr; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java new file mode 100644 index 0000000..70ae4ab --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java @@ -0,0 +1,7 @@ +/** + * + */ +/** Implements the cosine similarity metric for scoring relevant documents + * + */ +package org.apache.nutch.scoring.similarity.cosine; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java new file mode 100644 index 0000000..4b519bc --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.scoring.similarity.util; + +import java.io.Reader; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.util.CharArraySet; + +/** + * Creates a custom analyzer based on user provided inputs + * + */ +public class LuceneAnalyzerUtil extends Analyzer{ + + public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE } + + private static StemFilterType stemFilterType; + private static CharArraySet stopSet; + + + /** + * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true + */ + public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) { + LuceneAnalyzerUtil.stemFilterType = stemFilterType; + if(useStopFilter) { + stopSet = StandardAnalyzer.STOP_WORDS_SET; + } + else { + stopSet = null; + } + } + + /** + * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then + * user provided stop words will be added to the Lucene default stopset. 
+ */ + public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) { + LuceneAnalyzerUtil.stemFilterType = stemFilterType; + if(addToDefault) { + stopSet.addAll(stopWords); + } + else { + stopSet = StopFilter.makeStopSet(stopWords); + } + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer source = new ClassicTokenizer(); + TokenStream filter = new LowerCaseFilter(source); + if(stopSet != null) { + filter = new StopFilter(filter, stopSet); + } + + switch(stemFilterType){ + case PORTERSTEM_FILTER: + filter = new PorterStemFilter(filter); + break; + case ENGLISHMINIMALSTEM_FILTER: + filter = new EnglishMinimalStemFilter(filter); + break; + default: + break; + } + return new TokenStreamComponents(source, filter); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java new file mode 100644 index 0000000..acb987c --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java @@ -0,0 +1,166 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.scoring.similarity.util; + +import java.io.StringReader; +import java.util.List; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; + +public class LuceneTokenizer { + + private TokenStream tokenStream; + private TokenizerType tokenizer; + private StemFilterType stemFilterType; + private CharArraySet stopSet = null; + + public static enum TokenizerType {CLASSIC, STANDARD} + + /** + * Creates a tokenizer based on param values + * @param content - The text to tokenize + * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT + * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset + * @param stemFilterType - Type of stemming to perform + */ + public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) { + this.tokenizer = tokenizer; + this.stemFilterType = 
stemFilterType; + if(useStopFilter) { + stopSet = StandardAnalyzer.STOP_WORDS_SET; + } + tokenStream = createTokenStream(content); + } + + /** + * Creates a tokenizer based on param values + * @param content - The text to tokenize + * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT + * @param stopSet - Provide a set of user defined stop words + * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set. + * If false, then only the user provided words will be used as the stop set + * @param stemFilterType + */ + public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) { + this.tokenizer = tokenizer; + this.stemFilterType = stemFilterType; + if(addToDefault) { + CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);; + for(String word : stopWords){ + stopSet.add(word); + } + this.stopSet = stopSet; + } + else { + stopSet = new CharArraySet(stopWords, true); + } + tokenStream = createTokenStream(content); + } + + /** + * Returns the tokenStream created by the Tokenizer + * @return + */ + public TokenStream getTokenStream() { + return tokenStream; + } + + /** + * Creates a tokenizer for the ngram model based on param values + * @param content - The text to tokenize + * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT + * @param stemFilterType - Type of stemming to perform + * @param mingram - Value of mingram for tokenizing + * @param maxgram - Value of maxgram for tokenizing + */ + public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) { + this.tokenizer = tokenizer; + this.stemFilterType = stemFilterType; + tokenStream = createNGramTokenStream(content, mingram, maxgram); + } + + private TokenStream createTokenStream(String content) { + tokenStream = generateTokenStreamFromText(content, tokenizer); + tokenStream = new 
LowerCaseFilter(tokenStream); + if(stopSet != null) { + tokenStream = applyStopFilter(stopSet); + } + tokenStream = applyStemmer(stemFilterType); + return tokenStream; + } + + private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){ + Tokenizer tokenizer = null; + switch(tokenizerType){ + case CLASSIC: + tokenizer = new ClassicTokenizer(); + break; + + case STANDARD: + default: + tokenizer = new StandardTokenizer(); + } + + tokenizer.setReader(new StringReader(content)); + + tokenStream = tokenizer; + + return tokenStream; + } + + private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) { + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(content)); + tokenStream = new LowerCaseFilter(tokenizer); + tokenStream = applyStemmer(stemFilterType); + ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram); + shingleFilter.setOutputUnigrams(false); + tokenStream = (TokenStream)shingleFilter; + return tokenStream; + } + + private TokenStream applyStopFilter(CharArraySet stopWords) { + tokenStream = new StopFilter(tokenStream, stopWords); + return tokenStream; + } + + private TokenStream applyStemmer(StemFilterType stemFilterType) { + switch(stemFilterType){ + case ENGLISHMINIMALSTEM_FILTER: + tokenStream = new EnglishMinimalStemFilter(tokenStream); + break; + case PORTERSTEM_FILTER: + tokenStream = new PorterStemFilter(tokenStream); + break; + default: + break; + } + + return tokenStream; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java 
new file mode 100644 index 0000000..f660977 --- /dev/null +++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java @@ -0,0 +1,24 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * + */ +/** + * Utility package for Lucene functions + * + */ +package org.apache.nutch.scoring.similarity.util; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/README.txt b/nutch-plugins/subcollection/README.txt new file mode 100644 index 0000000..6b79d16 --- /dev/null +++ b/nutch-plugins/subcollection/README.txt @@ -0,0 +1,10 @@ +For brief description about this plugin see +src/java/org/apache/nutch/collection/package.html + +Basically: +You need to enable this during indexing and during searching + +After indexing you can limit your searches to certain +subcollection with keyword subcollection, eg. 
+ +"subcollection:nutch hadoop" http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/build.xml b/nutch-plugins/subcollection/build.xml new file mode 100644 index 0000000..77beac6 --- /dev/null +++ b/nutch-plugins/subcollection/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="subcollection" default="jar"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/ivy.xml b/nutch-plugins/subcollection/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/subcollection/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/plugin.xml b/nutch-plugins/subcollection/plugin.xml new file mode 100644 index 0000000..ca2cf2f --- /dev/null +++ b/nutch-plugins/subcollection/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="subcollection" + name="Subcollection indexing and query filter" + version="1.0.0" + provider-name="apache.org"> + + <runtime> + <library name="subcollection.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.subcollection.indexing" + name="Subcollection Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="SubcollectionIndexingFilter" + class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/> + + </extension> +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/pom.xml b/nutch-plugins/subcollection/pom.xml new file mode 100644 index 0000000..d8e3a97 --- /dev/null +++ b/nutch-plugins/subcollection/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>subcollection</artifactId> + <packaging>jar</packaging> + + <name>subcollection</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java new file mode 100644 index 0000000..0dff3f8 --- /dev/null +++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.collection; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.util.DomUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.ObjectCache; +import org.apache.xerces.dom.DocumentImpl; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +public class CollectionManager extends Configured { + + public static final String DEFAULT_FILE_NAME = "subcollections.xml"; + + static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class); + + transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>(); + + transient URL configfile; + + public CollectionManager(Configuration conf) { + super(conf); + init(); + } + + /** + * Used for testing + */ + protected CollectionManager() { + super(NutchConfiguration.create()); + } + + protected void init() { + try { + if (LOG.isInfoEnabled()) { 
+ LOG.info("initializing CollectionManager"); + } + // initialize known subcollections + configfile = getConf().getResource( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + + InputStream input = getConf().getConfResourceAsInputStream( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + parse(input); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error occured:" + e); + } + } + } + + protected void parse(InputStream input) { + Element collections = DomUtil.getDom(input); + + if (collections != null) { + NodeList nodeList = collections + .getElementsByTagName(Subcollection.TAG_COLLECTION); + + if (LOG.isInfoEnabled()) { + LOG.info("file has " + nodeList.getLength() + " elements"); + } + + for (int i = 0; i < nodeList.getLength(); i++) { + Element scElem = (Element) nodeList.item(i); + Subcollection subCol = new Subcollection(getConf()); + subCol.initialize(scElem); + collectionMap.put(subCol.name, subCol); + } + } else if (LOG.isInfoEnabled()) { + LOG.info("Cannot find collections"); + } + } + + public static CollectionManager getCollectionManager(Configuration conf) { + String key = "collectionmanager"; + ObjectCache objectCache = ObjectCache.get(conf); + CollectionManager impl = (CollectionManager) objectCache.getObject(key); + if (impl == null) { + try { + if (LOG.isInfoEnabled()) { + LOG.info("Instantiating CollectionManager"); + } + impl = new CollectionManager(conf); + objectCache.setObject(key, impl); + } catch (Exception e) { + throw new RuntimeException("Couldn't create CollectionManager", e); + } + } + return impl; + } + + /** + * Returns named subcollection + * + * @param id + * @return Named SubCollection (or null if not existing) + */ + public Subcollection getSubColection(final String id) { + return (Subcollection) collectionMap.get(id); + } + + /** + * Delete named subcollection + * + * @param id + * Id of SubCollection to delete + */ + public void deleteSubCollection(final String id) throws IOException { 
+ final Subcollection subCol = getSubColection(id); + if (subCol != null) { + collectionMap.remove(id); + } + } + + /** + * Create a new subcollection. + * + * @param name + * Name of SubCollection to create + * @return Created SubCollection or null if allready existed + */ + public Subcollection createSubCollection(final String id, final String name) { + Subcollection subCol = null; + + if (!collectionMap.containsKey(id)) { + subCol = new Subcollection(id, name, getConf()); + collectionMap.put(id, subCol); + } + + return subCol; + } + + /** + * Return names of collections url is part of + * + * @param url + * The url to test against Collections + * @return Subcollections + */ + public List<Subcollection> getSubCollections(final String url) { + List<Subcollection> collections = new ArrayList<Subcollection>(); + final Iterator iterator = collectionMap.values().iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = (Subcollection) iterator.next(); + if (subCol.filter(url) != null) { + collections.add(subCol); + } + } + if (LOG.isTraceEnabled()) { + LOG.trace("subcollections:" + Arrays.toString(collections.toArray())); + } + + return collections; + } + + /** + * Returns all collections + * + * @return All collections CollectionManager knows about + */ + public Collection getAll() { + return collectionMap.values(); + } + + /** + * Save collections into file + * + * @throws Exception + */ + public void save() throws IOException { + try { + final FileOutputStream fos = new FileOutputStream(new File( + configfile.getFile())); + final Document doc = new DocumentImpl(); + final Element collections = doc + .createElement(Subcollection.TAG_COLLECTIONS); + final Iterator iterator = collectionMap.values().iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = (Subcollection) iterator.next(); + final Element collection = doc + .createElement(Subcollection.TAG_COLLECTION); + collections.appendChild(collection); + final Element name = 
doc.createElement(Subcollection.TAG_NAME); + name.setNodeValue(subCol.getName()); + collection.appendChild(name); + final Element whiteList = doc + .createElement(Subcollection.TAG_WHITELIST); + whiteList.setNodeValue(subCol.getWhiteListString()); + collection.appendChild(whiteList); + final Element blackList = doc + .createElement(Subcollection.TAG_BLACKLIST); + blackList.setNodeValue(subCol.getBlackListString()); + collection.appendChild(blackList); + } + + DomUtil.saveDom(fos, collections); + fos.flush(); + fos.close(); + } catch (FileNotFoundException e) { + throw new IOException(e.toString()); + } + } +}
