Author: jnioche
Date: Mon Mar 22 09:00:11 2010
New Revision: 926003

URL: http://svn.apache.org/viewvc?rev=926003&view=rev
Log:
NUTCH-740 Configuration option to override default language for fetched pages

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003&r1=926002&r2=926003&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche)
+
 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)
 
 * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003&r1=926002&r2=926003&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010
@@ -228,6 +228,15 @@
   </description>
 </property>
 
+<property>
+  <name>http.accept.language</name>
+  <value>en-us,en-gb,en;q=0.7,*;q=0.3</value>
+  <description>Value of the "Accept-Language" request header field.
+  This allows selecting non-English language as default one to retrieve.
+  It is a useful setting for search engines build for certain national group.
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003&r1=926002&r2=926003&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Mar 22 09:00:11 2010
@@ -93,6 +93,8 @@ public abstract class HttpBase implement
                         "http://lucene.apache.org/nutch/bot.html",
                         "nutch-ag...@lucene.apache.org");
 
+  /** The "Accept-Language" request header value. */
+  protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
     
   /**
    * Maps from host to a Long naming the time it should be unblocked.
@@ -162,6 +164,7 @@ public abstract class HttpBase implement
         this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1);
         this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
                 .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
+        this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
         this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
         this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000);
         // backward-compatible default setting
@@ -326,6 +329,13 @@ public abstract class HttpBase implement
     return userAgent;
   }
   
+  /** Value of "Accept-Language" request header sent by Nutch.
+   * @return The value of the header "Accept-Language" header.
+   */
+  public String getAcceptLanguage() {
+         return acceptLanguage;
+  }
+
   public boolean getUseHttp11() {
     return useHttp11;
   }
@@ -470,6 +480,7 @@ public abstract class HttpBase implement
       logger.info("http.timeout = " + timeout);
       logger.info("http.content.limit = " + maxContent);
       logger.info("http.agent = " + userAgent);
+      logger.info("http.accept.language = " + acceptLanguage);
       logger.info(Protocol.CHECK_BLOCKING + " = " + checkBlocking);
       logger.info(Protocol.CHECK_ROBOTS + " = " + checkRobots);
       if (checkBlocking) {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003&r1=926002&r2=926003&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Mar 22 09:00:11 2010
@@ -123,6 +123,10 @@ public class HttpResponse implements Res
         reqStr.append(userAgent);
         reqStr.append("\r\n");
       }
+      
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
 
       reqStr.append("\r\n");
       if (datum.getModifiedTime() > 0) {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=926003&r1=926002&r2=926003&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Mar 22 09:00:11 2010
@@ -185,7 +185,7 @@ public class Http extends HttpBase {
     // Set the User Agent in the header
     headers.add(new Header("User-Agent", userAgent));
     // prefer English
-    headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
+    headers.add(new Header("Accept-Language", acceptLanguage));
     // prefer UTF-8
     headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
     // prefer understandable formats


Reply via email to