http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/build-ivy.xml b/src/plugin/protocol-selenium/build-ivy.xml deleted file mode 100644 index 67d39cd..0000000 --- a/src/plugin/protocol-selenium/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. --> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> - </target> - -</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/build.xml b/src/plugin/protocol-selenium/build.xml deleted file mode 100644 index 055018f..0000000 --- a/src/plugin/protocol-selenium/build.xml +++ /dev/null @@ -1,36 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="protocol-selenium" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-http"/> - <ant target="jar" inheritall="false" dir="../lib-selenium"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - <include name="**/lib-selenium/*.jar" /> - </fileset> - </path> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml deleted file mode 100644 index ff07f8c..0000000 --- a/src/plugin/protocol-selenium/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="default"/> - </publications> - - <dependencies> - <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! --> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/plugin.xml b/src/plugin/protocol-selenium/plugin.xml deleted file mode 100644 index 1454c1b..0000000 --- a/src/plugin/protocol-selenium/plugin.xml +++ /dev/null @@ -1,47 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<plugin - id="protocol-selenium" - name="Http Protocol Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="protocol-selenium.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-http"/> - <import plugin="lib-selenium"/> - </requires> - - <extension id="org.apache.nutch.protocol.selenium" - name="HttpProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.selenium.Http" - class="org.apache.nutch.protocol.selenium.Http"> - <parameter name="protocolName" value="http"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java deleted file mode 100644 index 7726bdf..0000000 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.selenium; - -// JDK imports -import java.io.IOException; -import java.net.URL; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.util.NutchConfiguration; - -import org.apache.nutch.protocol.selenium.HttpResponse; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class Http extends HttpBase { - - public static final Logger LOG = LoggerFactory.getLogger(Http.class); - - public Http() { - super(LOG); - } - - @Override - public void setConf(Configuration conf) { - super.setConf(conf); - } - - public static void main(String[] args) throws Exception { - Http http = new Http(); - http.setConf(NutchConfiguration.create()); - main(http, args); - } - - @Override - protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { - return new HttpResponse(this, url, datum); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java deleted file mode 100644 index 681e838..0000000 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ /dev/null @@ -1,360 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.selenium; - -// JDK imports -import java.io.BufferedInputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.io.PushbackInputStream; -import java.net.InetSocketAddress; -import java.net.Socket; -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.SpellCheckedMetadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpException; -import org.apache.nutch.protocol.http.api.HttpBase; - -/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */ - -public class HttpResponse implements Response { - - private Http http; - private URL url; - private String orig; - private String base; - private byte[] content; - private int code; - private Metadata headers = new SpellCheckedMetadata(); - - /** The nutch configuration */ - private Configuration conf = null; - - public HttpResponse(Http http, URL url, CrawlDatum datum) 
throws ProtocolException, IOException { - - this.conf = http.getConf(); - this.http = http; - this.url = url; - this.orig = url.toString(); - this.base = url.toString(); - - if (!"http".equals(url.getProtocol())) - throw new HttpException("Not an HTTP url:" + url); - - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace("fetching " + url); - } - - String path = "".equals(url.getFile()) ? "/" : url.getFile(); - - // some servers will redirect a request with a host line like - // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they - // don't want the :80... - - String host = url.getHost(); - int port; - String portString; - if (url.getPort() == -1) { - port = 80; - portString = ""; - } else { - port = url.getPort(); - portString = ":" + port; - } - Socket socket = null; - - try { - socket = new Socket(); // create the socket - socket.setSoTimeout(http.getTimeout()); - - // connect - String sockHost = http.useProxy(url) ? http.getProxyHost() : host; - int sockPort = http.useProxy(url) ? 
http.getProxyPort() : port; - InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); - socket.connect(sockAddr, http.getTimeout()); - - // make request - OutputStream req = socket.getOutputStream(); - - StringBuffer reqStr = new StringBuffer("GET "); - if (http.useProxy(url)) { - reqStr.append(url.getProtocol() + "://" + host + portString + path); - } else { - reqStr.append(path); - } - - reqStr.append(" HTTP/1.0\r\n"); - - reqStr.append("Host: "); - reqStr.append(host); - reqStr.append(portString); - reqStr.append("\r\n"); - - reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); - - String userAgent = http.getUserAgent(); - if ((userAgent == null) || (userAgent.length() == 0)) { - if (Http.LOG.isErrorEnabled()) { - Http.LOG.error("User-agent is not set!"); - } - } else { - reqStr.append("User-Agent: "); - reqStr.append(userAgent); - reqStr.append("\r\n"); - } - - reqStr.append("Accept-Language: "); - reqStr.append(this.http.getAcceptLanguage()); - reqStr.append("\r\n"); - - reqStr.append("Accept: "); - reqStr.append(this.http.getAccept()); - reqStr.append("\r\n"); - - if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); - reqStr.append("\r\n"); - } - reqStr.append("\r\n"); - - byte[] reqBytes = reqStr.toString().getBytes(); - - req.write(reqBytes); - req.flush(); - - PushbackInputStream in = // process response - new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), - Http.BUFFER_SIZE); - - StringBuffer line = new StringBuffer(); - - boolean haveSeenNonContinueStatus = false; - while (!haveSeenNonContinueStatus) { - // parse status code line - this.code = parseStatusLine(in, line); - // parse headers - parseHeaders(in, line); - haveSeenNonContinueStatus = code != 100; // 100 is "Continue" - } - - // Get Content type header - String contentType = getHeader(Response.CONTENT_TYPE); - - // handle with Selenium only if content type in 
HTML or XHTML - if (contentType != null) { - if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { - readPlainContent(url); - } else { - try { - int contentLength = Integer.MAX_VALUE; - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - try { - contentLength = Integer.parseInt(contentLengthString.trim()); - } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + contentLengthString); - } - } - - if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { - contentLength = http.getMaxContent(); - } - - byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; - int bufferFilled = 0; - int totalRead = 0; - ByteArrayOutputStream out = new ByteArrayOutputStream(); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 - && totalRead + bufferFilled <= contentLength) { - totalRead += bufferFilled; - out.write(buffer, 0, bufferFilled); - } - - content = out.toByteArray(); - - } catch (Exception e) { - if (code == 200) - throw new IOException(e.toString()); - // for codes other than 200 OK, we are fine with empty content - } finally { - if (in != null) { - in.close(); - } - } - } - } - - } finally { - if (socket != null) - socket.close(); - } - } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - - public URL getUrl() { - return url; - } - - public int getCode() { - return code; - } - - public String getHeader(String name) { - return headers.get(name); - } - - public Metadata getHeaders() { - return headers; - } - - public byte[] getContent() { - return content; - } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - - private void readPlainContent(URL url) throws IOException { - String page = HttpWebClient.getHtmlPage(url.toString(), conf); - - content = page.getBytes("UTF-8"); - } - - private int parseStatusLine(PushbackInputStream in, 
StringBuffer line) throws IOException, HttpException { - readLine(in, line, false); - - int codeStart = line.indexOf(" "); - int codeEnd = line.indexOf(" ", codeStart + 1); - - // handle lines with no plaintext result code, ie: - // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" - if (codeEnd == -1) - codeEnd = line.length(); - - int code; - try { - code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); - } catch (NumberFormatException e) { - throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e); - } - - return code; - } - - private void processHeaderLine(StringBuffer line) throws IOException, HttpException { - - int colonIndex = line.indexOf(":"); // key is up to colon - if (colonIndex == -1) { - int i; - for (i = 0; i < line.length(); i++) - if (!Character.isWhitespace(line.charAt(i))) - break; - if (i == line.length()) - return; - throw new HttpException("No colon in header:" + line); - } - String key = line.substring(0, colonIndex); - - int valueStart = colonIndex + 1; // skip whitespace - while (valueStart < line.length()) { - int c = line.charAt(valueStart); - if (c != ' ' && c != '\t') - break; - valueStart++; - } - String value = line.substring(valueStart); - headers.set(key, value); - } - - // Adds headers to our headers Metadata - private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { - - while (readLine(in, line, true) != 0) { - - // handle HTTP responses with missing blank line after headers - int pos; - if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1) - || ((pos = line.indexOf("<html")) != -1)) { - - in.unread(line.substring(pos).getBytes("UTF-8")); - line.setLength(pos); - - try { - //TODO: (CM) We don't know the header names here - //since we're just handling them generically. 
It would - //be nice to provide some sort of mapping function here - //for the returned header names to the standard metadata - //names in the ParseData class - processHeaderLine(line); - } catch (Exception e) { - // fixme: - Http.LOG.warn("Error: ", e); - } - return; - } - - processHeaderLine(line); - } - } - - private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine) - throws IOException { - line.setLength(0); - for (int c = in.read(); c != -1; c = in.read()) { - switch (c) { - case '\r': - if (peek(in) == '\n') { - in.read(); - } - case '\n': - if (line.length() > 0) { - // at EOL -- check for continued line if the current - // (possibly continued) line wasn't blank - if (allowContinuedLine) - switch (peek(in)) { - case ' ': - case '\t': // line is continued - in.read(); - continue; - } - } - return line.length(); // else complete - default: - line.append((char) c); - } - } - throw new EOFException(); - } - - private static int peek(PushbackInputStream in) throws IOException { - int value = in.read(); - in.unread(value); - return value; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html deleted file mode 100644 index 75cd5b5..0000000 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/build.xml ---------------------------------------------------------------------- diff --git 
a/src/plugin/scoring-depth/build.xml b/src/plugin/scoring-depth/build.xml deleted file mode 100644 index 6c041ed..0000000 --- a/src/plugin/scoring-depth/build.xml +++ /dev/null @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<project name="scoring-depth" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-depth/ivy.xml b/src/plugin/scoring-depth/ivy.xml deleted file mode 100644 index 24d7606..0000000 --- a/src/plugin/scoring-depth/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-depth/plugin.xml b/src/plugin/scoring-depth/plugin.xml deleted file mode 100644 index ea57dc6..0000000 --- a/src/plugin/scoring-depth/plugin.xml +++ /dev/null @@ -1,24 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<plugin - id="scoring-depth" - name="Scoring plugin for depth-limited crawling." 
- version="1.0.0" - provider-name="ant.com"> - - <runtime> - <library name="scoring-depth.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.scoring.depth" - name="Depth Scoring Filter" - point="org.apache.nutch.scoring.ScoringFilter"> - <implementation id="DepthScoringFilter" - class="org.apache.nutch.scoring.depth.DepthScoringFilter"/> - </extension> -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java deleted file mode 100644 index 0a0dd27..0000000 --- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java +++ /dev/null @@ -1,207 +0,0 @@ -package org.apache.nutch.scoring.depth; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.Map.Entry; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; - -/** - * This scoring filter limits the number of hops from the initial seed urls. 
If - * the number of hops exceeds the depth (either the default value, or the one - * set in the injector file) then all outlinks from that url are discarded, - * effectively stopping further crawling along this path. - */ -public class DepthScoringFilter extends Configured implements ScoringFilter { - private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class); - - public static final String DEPTH_KEY = "_depth_"; - public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY); - public static final String MAX_DEPTH_KEY = "_maxdepth_"; - public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY); - - // maximum value that we are never likely to reach - // because the depth of the Web graph is that high only - // for spam cliques. - public static final int DEFAULT_MAX_DEPTH = 1000; - - private int defaultMaxDepth; - - @Override - public void setConf(Configuration conf) { - super.setConf(conf); - if (conf == null) - return; - defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH); - if (defaultMaxDepth <= 0) { - defaultMaxDepth = DEFAULT_MAX_DEPTH; - } - } - - @Override - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - String depthString = parseData.getMeta(DEPTH_KEY); - if (depthString == null) { - LOG.warn("Missing depth, removing all outlinks from url " + fromUrl); - targets.clear(); - return adjust; - } - int curDepth = Integer.parseInt(depthString); - int curMaxDepth = defaultMaxDepth; - IntWritable customMaxDepth = null; - // allow overrides from injector - String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY); - if (maxDepthString != null) { - curMaxDepth = Integer.parseInt(maxDepthString); - customMaxDepth = new IntWritable(curMaxDepth); - } - if (curDepth >= curMaxDepth) { - // depth exceeded - throw away - LOG.info("Depth limit (" + curMaxDepth - + ") reached, ignoring 
outlinks for " + fromUrl); - targets.clear(); - return adjust; - } - Iterator<Entry<Text, CrawlDatum>> it = targets.iterator(); - while (it.hasNext()) { - Entry<Text, CrawlDatum> e = it.next(); - // record increased depth - e.getValue().getMetaData() - .put(DEPTH_KEY_W, new IntWritable(curDepth + 1)); - // record maxDepth if any - if (customMaxDepth != null) { - e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth); - } - } - return adjust; - } - - // prioritize by smaller values of depth - @Override - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - // boost up by current depth - int curDepth, curMaxDepth; - IntWritable maxDepth = (IntWritable) datum.getMetaData().get( - MAX_DEPTH_KEY_W); - if (maxDepth != null) { - curMaxDepth = maxDepth.get(); - } else { - curMaxDepth = defaultMaxDepth; - } - IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W); - if (depth == null) { - // penalize - curDepth = curMaxDepth; - } else { - curDepth = depth.get(); - } - int mul = curMaxDepth - curDepth; - return initSort * (1 + mul); - } - - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - return initScore; - } - - @Override - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - // the datum might already have some values set - // e.g. 
obtained from redirection - // in which case we don't want to override them - if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null) - datum.getMetaData() - .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); - // initial depth is 1 - if (datum.getMetaData().get(DEPTH_KEY_W) == null) - datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); - } - - @Override - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - - // check for the presence of the depth limit key - if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) { - // convert from Text to Int - String depthString = datum.getMetaData().get(MAX_DEPTH_KEY_W).toString(); - datum.getMetaData().remove(MAX_DEPTH_KEY_W); - int depth = Integer.parseInt(depthString); - datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth)); - } else { // put the default - datum.getMetaData() - .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); - } - // initial depth is 1 - datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); - } - - @Override - public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - String depth = content.getMetadata().get(DEPTH_KEY); - if (depth != null) { - parse.getData().getParseMeta().set(DEPTH_KEY, depth); - } - String maxdepth = content.getMetadata().get(MAX_DEPTH_KEY); - if (maxdepth != null) { - parse.getData().getParseMeta().set(MAX_DEPTH_KEY, maxdepth); - } - } - - @Override - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) - throws ScoringFilterException { - IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W); - if (depth != null) { - content.getMetadata().set(DEPTH_KEY, depth.toString()); - } - IntWritable maxdepth = (IntWritable) datum.getMetaData().get( - MAX_DEPTH_KEY_W); - if (maxdepth != null) { - content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString()); - } - } - - @Override - public void updateDbScore(Text url, CrawlDatum old, 
CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - // find a minimum of all depths - int newDepth = DEFAULT_MAX_DEPTH; - if (old != null) { - IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W); - if (oldDepth != null) { - newDepth = oldDepth.get(); - } else { - // not set ? - initialScore(url, old); - } - } - for (CrawlDatum lnk : inlinked) { - IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W); - if (depth != null && depth.get() < newDepth) { - newDepth = depth.get(); - } - } - datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java deleted file mode 100644 index aa89797..0000000 --- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Scoring filter to stop crawling at a configurable depth - * (number of "hops" from seed URLs). - */ -package org.apache.nutch.scoring.depth; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-link/build.xml b/src/plugin/scoring-link/build.xml deleted file mode 100644 index 123b1ea..0000000 --- a/src/plugin/scoring-link/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="scoring-link" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-link/ivy.xml b/src/plugin/scoring-link/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/scoring-link/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-link/plugin.xml b/src/plugin/scoring-link/plugin.xml deleted file mode 100644 index 2b1c1e1..0000000 --- a/src/plugin/scoring-link/plugin.xml +++ /dev/null @@ -1,39 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<plugin - id="scoring-link" - name="Link Analysis Scoring Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="scoring-link.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.scoring.link" - name="LinkAnalysisScoring" - point="org.apache.nutch.scoring.ScoringFilter"> - - <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" - class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" /> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java deleted file mode 100644 index a143f46..0000000 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.scoring.link; - -import java.util.Collection; -import java.util.List; -import java.util.Map.Entry; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; - -public class LinkAnalysisScoringFilter implements ScoringFilter { - - private Configuration conf; - private float normalizedScore = 1.00f; - - public LinkAnalysisScoringFilter() { - - } - - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f); - } - - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - return adjust; - } - - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - return datum.getScore() * initSort; - } - - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - return (normalizedScore * dbDatum.getScore()); - } - - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - datum.setScore(0.0f); - } - - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - parse.getData().getContentMeta() - .set(Nutch.SCORE_KEY, 
content.getMetadata().get(Nutch.SCORE_KEY)); - } - - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) - throws ScoringFilterException { - content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); - } - - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - // nothing to do - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java deleted file mode 100644 index 9dc0c35..0000000 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Scoring filter used in conjunction with - * {@link org.apache.nutch.scoring.webgraph.WebGraph}. 
- */ -package org.apache.nutch.scoring.link; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/build.xml b/src/plugin/scoring-opic/build.xml deleted file mode 100644 index 137dab4..0000000 --- a/src/plugin/scoring-opic/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="scoring-opic" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/ivy.xml b/src/plugin/scoring-opic/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/scoring-opic/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. 
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/plugin.xml b/src/plugin/scoring-opic/plugin.xml deleted file mode 100644 index 3805a31..0000000 --- a/src/plugin/scoring-opic/plugin.xml +++ /dev/null @@ -1,39 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="scoring-opic" - name="OPIC Scoring Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="scoring-opic.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.scoring.opic" - name="OPICScoring" - point="org.apache.nutch.scoring.ScoringFilter"> - - <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter" - class="org.apache.nutch.scoring.opic.OPICScoringFilter" /> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java deleted file mode 100644 index e943d06..0000000 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ /dev/null @@ -1,173 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.scoring.opic; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Collection; -import java.util.List; -import java.util.Map.Entry; - -// Slf4j Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; - -/** - * This plugin implements a variant of an Online Page Importance Computation - * (OPIC) score, described in this paper: <a - * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/> - * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive - * On-Line Page Importance Computation </a>. 
- * - * @author Andrzej Bialecki - */ -public class OPICScoringFilter implements ScoringFilter { - - private final static Logger LOG = LoggerFactory - .getLogger(OPICScoringFilter.class); - - private Configuration conf; - private float scoreInjected; - private float scorePower; - private float internalScoreFactor; - private float externalScoreFactor; - private boolean countFiltered; - - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - scorePower = conf.getFloat("indexer.score.power", 0.5f); - internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f); - externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f); - countFiltered = conf.getBoolean("db.score.count.filtered", false); - } - - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - /** - * Set to 0.0f (unknown value) - inlink contributions will bring it to a - * correct level. Newly discovered pages have at least one inlink. - */ - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - datum.setScore(0.0f); - } - - /** Use {@link CrawlDatum#getScore()}. */ - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - return datum.getScore() * initSort; - } - - /** Increase the score by a sum of inlinked scores. */ - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - float adjust = 0.0f; - for (int i = 0; i < inlinked.size(); i++) { - CrawlDatum linked = inlinked.get(i); - adjust += linked.getScore(); - } - if (old == null) - old = datum; - datum.setScore(old.getScore() + adjust); - } - - /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. 
*/ - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { - content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); - } - - /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */ - public void passScoreAfterParsing(Text url, Content content, Parse parse) { - parse.getData().getContentMeta() - .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); - } - - /** - * Get a float value from Fetcher.SCORE_KEY, divide it by the number of - * outlinks and apply. - */ - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - float score = scoreInjected; - String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY); - if (scoreString != null) { - try { - score = Float.parseFloat(scoreString); - } catch (Exception e) { - LOG.error("Error: ", e); - } - } - int validCount = targets.size(); - if (countFiltered) { - score /= allCount; - } else { - if (validCount == 0) { - // no outlinks to distribute score, so just return adjust - return adjust; - } - score /= validCount; - } - // internal and external score factor - float internalScore = score * internalScoreFactor; - float externalScore = score * externalScoreFactor; - for (Entry<Text, CrawlDatum> target : targets) { - try { - String toHost = new URL(target.getKey().toString()).getHost(); - String fromHost = new URL(fromUrl.toString()).getHost(); - if (toHost.equalsIgnoreCase(fromHost)) { - target.getValue().setScore(internalScore); - } else { - target.getValue().setScore(externalScore); - } - } catch (MalformedURLException e) { - LOG.error("Error: ", e); - target.getValue().setScore(externalScore); - } - } - // XXX (ab) no adjustment? I think this is contrary to the algorithm descr. - // XXX in the paper, where page "loses" its score if it's distributed to - // XXX linked pages... 
- return adjust; - } - - /** Dampen the boost value by scorePower. */ - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java deleted file mode 100644 index 26f6cbe..0000000 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Scoring filter implementing a variant of the Online Page Importance Computation - * (OPIC) algorithm. 
- */ -package org.apache.nutch.scoring.opic; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/build-ivy.xml b/src/plugin/scoring-similarity/build-ivy.xml deleted file mode 100644 index 50fbb96..0000000 --- a/src/plugin/scoring-similarity/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. 
--> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml deleted file mode 100644 index 66ac8f3..0000000 --- a/src/plugin/scoring-similarity/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="scoring-similarity" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml deleted file mode 100644 index be0a1de..0000000 --- a/src/plugin/scoring-similarity/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml deleted file mode 100644 index 9639c18..0000000 --- a/src/plugin/scoring-similarity/plugin.xml +++ /dev/null @@ -1,45 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
<!-- Plugin descriptor for the similarity based scoring plug-in. -->
<plugin id="scoring-similarity"
        name="Similarity based Scoring Plug-in"
        version="1.0.0"
        provider-name="nutch.org">

  <!-- Jars listed for the plugin at runtime; only the plugin's own jar exports its contents. -->
  <runtime>
    <library name="scoring-similarity.jar">
      <export name="*"/>
    </library>
    <library name="lucene-analyzers-common-5.5.0.jar"/>
    <library name="lucene-core-5.5.0.jar"/>
  </runtime>

  <requires>
    <import plugin="nutch-extensionpoints"/>
  </requires>

  <!-- Registers SimilarityScoringFilter at the ScoringFilter extension point. -->
  <extension point="org.apache.nutch.scoring.ScoringFilter"
             id="org.apache.nutch.scoring.similarity"
             name="SimilarityScoring">
    <implementation class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter"
                    id="scoring-similarity"/>
  </extension>

</plugin>
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.scoring.similarity; - -import java.util.Collection; -import java.util.Map.Entry; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; - -public interface SimilarityModel { - - public void setConf(Configuration conf); - - public float setURLScoreAfterParsing(Text url, Content content, Parse parse); - - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount); -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java deleted file mode 100644 index 0f905b8..0000000 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.scoring.similarity; - -import java.util.Collection; -import java.util.List; -import java.util.Map.Entry; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.AbstractScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; -import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity; - -public class SimilarityScoringFilter extends AbstractScoringFilter { - - private Configuration conf; - private SimilarityModel similarityModel; - @Override - public Configuration getConf() { - return conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - switch(conf.get("scoring.similarity.model","cosine")){ - case "cosine": - similarityModel = (SimilarityModel) new CosineSimilarity(); - break; - } - similarityModel.setConf(conf); - } - - @Override - public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - - float score = similarityModel.setURLScoreAfterParsing(url, content, parse); - parse.getData().getContentMeta() - .set(Nutch.SCORE_KEY, score+""); - } - - @Override - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - 
similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount); - return adjust; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java deleted file mode 100644 index 9853b34..0000000 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.scoring.similarity.cosine; - -import java.io.IOException; -import java.util.Collection; -import java.util.Map.Entry; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.StringUtils; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.similarity.SimilarityModel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class CosineSimilarity implements SimilarityModel{ - - private Configuration conf; - private final static Logger LOG = LoggerFactory - .getLogger(CosineSimilarity.class); - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public float setURLScoreAfterParsing(Text url, Content content, Parse parse) { - float score = 1; - - try { - if(!Model.isModelCreated){ - Model.createModel(conf); - } - String metatags = parse.getData().getParseMeta().get("metatag.keyword"); - String metaDescription = parse.getData().getParseMeta().get("metatag.description"); - int[] ngramArr = Model.retrieveNgrams(conf); - int mingram = ngramArr[0]; - int maxgram = ngramArr[1]; - DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram); - if(docVector!=null){ - score = Model.computeCosineSimilarity(docVector); - LOG.info("Setting score of {} to {}",url, score); - } - else { - throw new Exception("Could not create DocVector from parsed text"); - } - } catch (Exception e) { - LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e)); - } - return score; - } - - @Override - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, - Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, - int allCount) { - float score = 
Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY)); - for (Entry<Text, CrawlDatum> target : targets) { - target.getValue().setScore(score); - } - return adjust; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java deleted file mode 100644 index 33c3a23..0000000 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.scoring.similarity.cosine; - -import java.util.HashMap; -import java.util.Map; - -public class DocVector { - - public HashMap<Integer, Long> termVector; - public HashMap<String, Integer> termFreqVector; - - public DocVector() { - termFreqVector = new HashMap<>(); - } - - public void setTermFreqVector(HashMap<String, Integer> termFreqVector) { - this.termFreqVector = termFreqVector; - } - - public void setVectorEntry(int pos, long freq) { - termVector.put(pos, freq); - } - - public float dotProduct(DocVector docVector) { - float product = 0; - for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) { - if(docVector.termFreqVector.containsKey(entry.getKey())) { - product += docVector.termFreqVector.get(entry.getKey())*entry.getValue(); - } - } - return product; - } - - public float getL2Norm() { - float sum = 0; - for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) { - sum += entry.getValue()*entry.getValue(); - } - return (float) Math.sqrt(sum); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java deleted file mode 100644 index d8180f2..0000000 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.scoring.similarity.cosine; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.StringUtils; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; -import org.apache.nutch.scoring.similarity.util.LuceneTokenizer; -import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType; -import org.apache.tika.Tika; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class creates a model used to store Document vector representation of the corpus. 
- * - */ -public class Model { - - //Currently only one file, but in future could accept a corpus hence an ArrayList - public static ArrayList<DocVector> docVectors = new ArrayList<>(); - private static final Logger LOG = LoggerFactory.getLogger(Model.class); - public static boolean isModelCreated = false; - private static List<String> stopWords; - - public static synchronized void createModel(Configuration conf) throws IOException { - if(isModelCreated) { - LOG.info("Model exists, skipping model creation"); - return; - } - LOG.info("Creating Cosine model"); - try { - //If user has specified a stopword file other than the template - if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) { - stopWords = new ArrayList<String>(); - String stopWord; - BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file")))); - while ((stopWord = br.readLine()) != null) { - stopWords.add(stopWord); - } - LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file")); - } - - int[] ngramArr = retrieveNgrams(conf); - int mingram = ngramArr[0]; - int maxgram = ngramArr[1]; - LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram); - - // TODO : Allow for corpus of documents to be provided as gold standard. 
- String line; - StringBuilder sb = new StringBuilder(); - BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file")))); - while ((line = br.readLine()) != null) { - sb.append(line); - } - DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram); - if(goldStandard!=null) - docVectors.add(goldStandard); - else { - throw new Exception("Could not create DocVector for goldstandard"); - } - } catch (Exception e) { - LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), - StringUtils.stringifyException(e)); - } - if(docVectors.size()>0) { - LOG.info("Cosine model creation complete"); - isModelCreated = true; - } - else - LOG.info("Cosine model creation failed"); - } - - /** - * Used to create a DocVector from given String text. Used during the parse stage of the crawl - * cycle to create a DocVector of the currently parsed page from the parseText attribute value - * @param content The text to tokenize - * @param mingram Value of mingram for tokenizing - * @param maxgram Value of maxgram for tokenizing - */ - public static DocVector createDocVector(String content, int mingram, int maxgram) { - LuceneTokenizer tokenizer; - - if(mingram > 1 && maxgram > 1){ - LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); - tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); - } else if (mingram > 1) { - maxgram = mingram; - LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); - tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); - } - else if(stopWords!=null) { - tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, - StemFilterType.PORTERSTEM_FILTER); - } - else { - tokenizer = new 
LuceneTokenizer(content, TokenizerType.STANDARD, true, - StemFilterType.PORTERSTEM_FILTER); - } - TokenStream tStream = tokenizer.getTokenStream(); - HashMap<String, Integer> termVector = new HashMap<>(); - try { - CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class); - tStream.reset(); - while(tStream.incrementToken()) { - String term = charTermAttribute.toString(); - LOG.debug(term); - if(termVector.containsKey(term)) { - int count = termVector.get(term); - count++; - termVector.put(term, count); - } - else { - termVector.put(term, 1); - } - } - DocVector docVector = new DocVector(); - docVector.setTermFreqVector(termVector); - return docVector; - } catch (IOException e) { - LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e)); - } - return null; - } - - public static float computeCosineSimilarity(DocVector docVector) { - float scores[] = new float[docVectors.size()]; - int i=0; - float maxScore = 0; - for(DocVector corpusDoc : docVectors) { - float numerator = docVector.dotProduct(corpusDoc); - float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm(); - float currentScore = numerator/denominator; - scores[i++] = currentScore; - maxScore = (currentScore>maxScore)? 
currentScore : maxScore; - } - // Returning the max score amongst all documents in the corpus - return maxScore; - } - - /** - * Retrieves mingram and maxgram from configuration - * @param conf Configuration to retrieve mingram and maxgram - * @return ngram array as mingram at first index and maxgram at second index - */ - public static int[] retrieveNgrams(Configuration conf){ - int[] ngramArr = new int[2]; - //Check if user has specified mingram or ngram for ngram cosine model - String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1"); - //mingram - ngramArr[0] = Integer.parseInt(ngramStr[0]); - int maxgram; - if (ngramStr.length > 1) { - //maxgram - ngramArr[1] = Integer.parseInt(ngramStr[1]); - } else { - //maxgram - ngramArr[1] = ngramArr[0]; - } - return ngramArr; - } -} \ No newline at end of file
