svn commit: r365576 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Author: ab Date: Tue Jan 3 00:35:04 2006 New Revision: 365576 URL: http://svn.apache.org/viewcvs?rev=365576&view=rev Log: Fixed an NPE, in case of a fetch error we don't have a score value from Fetcher. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=365576&r1=365575&r2=365576&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Jan 3 00:35:04 2006 @@ -37,7 +37,8 @@ String name) throws IOException { final float interval = job.getFloat("db.default.fetch.interval", 30f); - +final float extscore = job.getFloat("db.score.link.external", 1.0f); + File text = new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name); File data = @@ -81,8 +82,10 @@ Outlink[] links = parse.getData().getOutlinks(); // compute OPIC score contribution - float score = -Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY)); + String scoreString = parse.getData().get(Fetcher.SCORE_KEY); + float score = extscore; + // this may happen if there was a fetch error. + if (scoreString != null) score = Float.parseFloat(scoreString); score /= links.length; for (int i = 0; i < links.length; i++) {
[Nutch Wiki] Trivial Update of FAQ by GalNitzan
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by GalNitzan: http://wiki.apache.org/nutch/FAQ -- To solve this problem, add the following java param to the java instantiation in bin/nutch: JAVA_IPV4=-Djava.net.preferIPv4Stack=true + # run it exec $JAVA $JAVA_HEAP_MAX $NUTCH_OPTS $JAVA_IPV4 -classpath $CLASSPATH $CLASS $@
[Nutch Wiki] Update of FrontPage by ByronMiller
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by ByronMiller: http://wiki.apache.org/nutch/FrontPage -- * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch Documentation] * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki] + * [Search Theory Papers]
[Nutch Wiki] Update of FrontPage by ByronMiller
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by ByronMiller: http://wiki.apache.org/nutch/FrontPage -- * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch Documentation] * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki] - * [Search Theory Papers] + * [Search Theory Search Theory White Papers]
[Nutch Wiki] Update of FrontPage by ByronMiller
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by ByronMiller: http://wiki.apache.org/nutch/FrontPage -- * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch Documentation] * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki] - * [Search_Theory] Search Theory White Papers + * [Search_Theory] Search Theory White Papers
[Nutch Wiki] Update of Search Theory by ByronMiller
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by ByronMiller: http://wiki.apache.org/nutch/Search_Theory The comment on the change is: cleanup of mess :) -- + ## page was renamed from Search Theory Papers Search Theory White Papers. Publicly available white papers, best practices, theories and publications about search related topics. @@ -10, +11 @@ [http://www.cs.toronto.edu/~georgem/hilltop/ Hilltop] Search Engine based on expert Documents. -
svn commit: r365850 - in /lucene/nutch/trunk/src/plugin/protocol-httpclient: ./ lib/ src/java/org/apache/nutch/protocol/httpclient/
Author: ab Date: Tue Jan 3 23:32:04 2006 New Revision: 365850 URL: http://svn.apache.org/viewcvs?rev=365850&view=rev Log: Update Commons HTTPClient to v. 3.0. Add some default headers to prefer HTML content, and in English. Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar (with props) Removed: lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar?rev=365850&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=365850&r1=365849&r2=365850&view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Tue Jan 3 23:32:04 2006 @@ -10,7 +10,7 @@ <export name="*"/> </library> <library name="commons-codec.jar" /> - <library name="commons-httpclient-3.0-rc2.jar" /> + <library name="commons-httpclient-3.0.jar" /> </runtime> <requires> Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=365850&r1=365849&r2=365850&view=diff == ---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Tue Jan 3 23:32:04 2006 @@ -7,12 +7,14 @@ import java.net.MalformedURLException; import java.net.URL; import java.net.UnknownHostException; +import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.Credentials; +import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; @@ -358,6 +360,15 @@ } HostConfiguration hostConf = client.getHostConfiguration(); +ArrayList headers = new ArrayList(); +// prefer English +headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3")); +// prefer UTF-8 +headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); +// prefer understandable formats +headers.add(new Header("Accept", + "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); +hostConf.getParams().setParameter("http.default-headers", headers); if (PROXY) { hostConf.setProxy(PROXY_HOST, PROXY_PORT); } Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=365850&r1=365849&r2=365850&view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Tue Jan 3 23:32:04 2006 @@ -88,7 +88,7 @@
Header[] heads = get.getResponseHeaders(); for (int i = 0; i < heads.length; i++) { -headers.put(heads[i].getName(), heads[i].getValue()); +headers.setProperty(heads[i].getName(), heads[i].getValue()); } // always read content. Sometimes content is useful to find a cause // for error.