Author: jnioche Date: Mon Mar 22 09:00:11 2010 New Revision: 926003 URL: http://svn.apache.org/viewvc?rev=926003&view=rev Log: NUTCH-740 Configuration option to override default language for fetched pages
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003&r1=926002&r2=926003&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) + * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003&r1=926002&r2=926003&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010 @@ -228,6 +228,15 @@ </description> </property> +<property> + <name>http.accept.language</name> + <value>en-us,en-gb,en;q=0.7,*;q=0.3</value> + <description>Value of the "Accept-Language" request header field. + This allows selecting a non-English language as the default one to retrieve. + It is a useful setting for search engines built for a certain national group. 
+ </description> +</property> + <!-- FTP properties --> <property> Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003&r1=926002&r2=926003&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Mar 22 09:00:11 2010 @@ -93,6 +93,8 @@ public abstract class HttpBase implement "http://lucene.apache.org/nutch/bot.html", "nutch-ag...@lucene.apache.org"); + /** The "Accept-Language" request header value. */ + protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; /** * Maps from host to a Long naming the time it should be unblocked. @@ -162,6 +164,7 @@ public abstract class HttpBase implement this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1); this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); + this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000); // backward-compatible default setting @@ -326,6 +329,13 @@ public abstract class HttpBase implement return userAgent; } + /** Value of "Accept-Language" request header sent by Nutch. + * @return The value of the "Accept-Language" request header. 
+ */ + public String getAcceptLanguage() { + return acceptLanguage; + } + public boolean getUseHttp11() { return useHttp11; } @@ -470,6 +480,7 @@ public abstract class HttpBase implement logger.info("http.timeout = " + timeout); logger.info("http.content.limit = " + maxContent); logger.info("http.agent = " + userAgent); + logger.info("http.accept.language = " + acceptLanguage); logger.info(Protocol.CHECK_BLOCKING + " = " + checkBlocking); logger.info(Protocol.CHECK_ROBOTS + " = " + checkRobots); if (checkBlocking) { Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003&r1=926002&r2=926003&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Mar 22 09:00:11 2010 @@ -123,6 +123,10 @@ public class HttpResponse implements Res reqStr.append(userAgent); reqStr.append("\r\n"); } + + reqStr.append("Accept-Language: "); + reqStr.append(this.http.getAcceptLanguage()); + reqStr.append("\r\n"); reqStr.append("\r\n"); if (datum.getModifiedTime() > 0) { Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=926003&r1=926002&r2=926003&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Mar 22 09:00:11 2010 @@ -185,7 +185,7 @@ public class Http extends HttpBase { // Set the User Agent in the header headers.add(new Header("User-Agent", userAgent)); // prefer English - headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3")); + headers.add(new Header("Accept-Language", acceptLanguage)); // prefer UTF-8 headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); // prefer understandable formats