svn commit: r365576 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

2006-01-03 Thread ab
Author: ab
Date: Tue Jan  3 00:35:04 2006
New Revision: 365576

URL: http://svn.apache.org/viewcvs?rev=365576&view=rev
Log:
Fixed an NPE, in case of a fetch error we don't have a score value
from Fetcher.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=365576&r1=365575&r2=365576&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
Tue Jan  3 00:35:04 2006
@@ -37,7 +37,8 @@
   String name) throws IOException {
 
 final float interval = job.getFloat("db.default.fetch.interval", 30f);
-
+final float extscore = job.getFloat("db.score.link.external", 1.0f);
+
 File text =
   new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
 File data =
@@ -81,8 +82,10 @@
   Outlink[] links = parse.getData().getOutlinks();
 
   // compute OPIC score contribution
-  float score =
-Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
+  String scoreString = parse.getData().get(Fetcher.SCORE_KEY);
+  float score = extscore;
+  // this may happen if there was a fetch error.
+ if (scoreString != null) score = Float.parseFloat(scoreString);
   score /= links.length;
   
   for (int i = 0; i < links.length; i++) {




[Nutch Wiki] Trivial Update of FAQ by GalNitzan

2006-01-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by GalNitzan:
http://wiki.apache.org/nutch/FAQ

--
  To solve this problem, add the following java param to the java instantiation 
in bin/nutch:
  
  JAVA_IPV4=-Djava.net.preferIPv4Stack=true
+ 
  # run it
  exec $JAVA $JAVA_HEAP_MAX $NUTCH_OPTS $JAVA_IPV4 -classpath $CLASSPATH 
$CLASS $@
  


[Nutch Wiki] Update of FrontPage by ByronMiller

2006-01-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by ByronMiller:
http://wiki.apache.org/nutch/FrontPage

--
   * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch 
Documentation]
   * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki
   * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki]
+  * [Search Theory & Papers]
  


[Nutch Wiki] Update of FrontPage by ByronMiller

2006-01-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by ByronMiller:
http://wiki.apache.org/nutch/FrontPage

--
   * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch 
Documentation]
   * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki
   * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki]
-  * [Search Theory & Papers]
+  * [Search Theory  Search Theory  White Papers]
  


[Nutch Wiki] Update of FrontPage by ByronMiller

2006-01-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by ByronMiller:
http://wiki.apache.org/nutch/FrontPage

--
   * [http://wiki.media-style.com/display/nutchDocu/Home Stefan's Nutch 
Documentation]
   * [http://frutch.free.fr/wikini/ Frutch Wiki] -- French Nutch Wiki
   * The [http://nutch.sourceforge.net/cgi-bin/twiki/view/Main/Nutch Old Wiki]
-  * [Search_Theory] Search Theory  White Papers
+  * [Search_Theory] Search Theory  White Papers
  


[Nutch Wiki] Update of Search Theory by ByronMiller

2006-01-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by ByronMiller:
http://wiki.apache.org/nutch/Search_Theory

The comment on the change is:
cleanup of mess :)

--
+ ## page was renamed from Search Theory & Papers
  Search Theory & White Papers.
  
  Publicly available white papers, best practices, theories and publications 
about search related topics.  
@@ -10, +11 @@

  
  [http://www.cs.toronto.edu/~georgem/hilltop/ Hilltop] Search Engine based on 
expert Documents.
  
- 


svn commit: r365850 - in /lucene/nutch/trunk/src/plugin/protocol-httpclient: ./ lib/ src/java/org/apache/nutch/protocol/httpclient/

2006-01-03 Thread ab
Author: ab
Date: Tue Jan  3 23:32:04 2006
New Revision: 365850

URL: http://svn.apache.org/viewcvs?rev=365850&view=rev
Log:
Update Commons HTTPClient to v. 3.0.

Add some default headers to prefer HTML content, and in English.


Added:

lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
   (with props)
Removed:

lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Added: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar?rev=365850&view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=365850&r1=365849&r2=365850&view=diff
==
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Tue Jan  3 
23:32:04 2006
@@ -10,7 +10,7 @@
  <export name="*"/>
   </library>
   <library name="commons-codec.jar" />
-  <library name="commons-httpclient-3.0-rc2.jar" />
+  <library name="commons-httpclient-3.0.jar" />
</runtime>
 
<requires>

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=365850&r1=365849&r2=365850&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Tue Jan  3 23:32:04 2006
@@ -7,12 +7,14 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.UnknownHostException;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.apache.commons.httpclient.Credentials;
+import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
@@ -358,6 +360,15 @@
 }
 
 HostConfiguration hostConf = client.getHostConfiguration();
+ArrayList headers = new ArrayList();
+// prefer English
+headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
+// prefer UTF-8
+headers.add(new Header("Accept-Charset", 
"utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
+// prefer understandable formats
+headers.add(new Header("Accept",
+
"text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+hostConf.getParams().setParameter("http.default-headers", headers);
 if (PROXY) {
   hostConf.setProxy(PROXY_HOST, PROXY_PORT);
 }

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=365850&r1=365849&r2=365850&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Tue Jan  3 23:32:04 2006
@@ -88,7 +88,7 @@
   Header[] heads = get.getResponseHeaders();
 
   for (int i = 0; i < heads.length; i++) {
-headers.put(heads[i].getName(), heads[i].getValue());
+headers.setProperty(heads[i].getName(), heads[i].getValue());
   }
   // always read content. Sometimes content is useful to find a cause
   // for error.