Author: ab Date: Wed Mar 7 15:37:21 2007 New Revision: 515844 URL: http://svn.apache.org/viewvc?view=rev&rev=515844 Log: NUTCH-167 - Observation of robots "noarchive" directive.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java lucene/nutch/trunk/src/web/jsp/cached.jsp lucene/nutch/trunk/src/web/jsp/search.jsp Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 7 15:37:21 2007 @@ -151,7 +151,9 @@ 50. NUTCH-432 - Fix a bug where platform name with spaces would break the bin/nutch script. (Brian Whitman via ab) -51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. +51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. (ab) + +52. NUTCH-167 - Observation of robots "noarchive" directive. (ab) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Mar 7 15:37:21 2007 @@ -773,6 +773,17 @@ </property> <property> + <name>parser.caching.forbidden.policy</name> + <value>content</value> + <description>If a site (or a page) requests through its robot metatags + that it should not be shown as cached content, apply this policy. Currently + three keywords are recognized: "none" ignores any "noarchive" directives. + "content" doesn't show the content, but shows summaries (snippets). + "all" doesn't show either content or summaries.</description> +</property> + + +<property> <name>parser.html.impl</name> <value>neko</value> <description>HTML Parser implementation. Currently the following keywords Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Mar 7 15:37:21 2007 @@ -47,4 +47,16 @@ public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY); + /** Sites may request that search engines don't provide access to cached documents. */ + public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden"; + + /** Show both original forbidden content and summaries (default). */ + public static final String CACHING_FORBIDDEN_NONE = "none"; + + /** Don't show either original forbidden content or summaries. */ + public static final String CACHING_FORBIDDEN_ALL = "all"; + + /** Don't show original forbidden content, but show summaries. */ + public static final String CACHING_FORBIDDEN_CONTENT = "content"; + } Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Mar 7 15:37:21 2007 @@ -24,6 +24,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.Parse; import org.apache.nutch.indexer.IndexingFilter; @@ -89,6 +90,11 @@ } // add title indexed and stored so that it can be displayed doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED)); + // add cached content/summary display policy, if available + String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY); + if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { + doc.add(new Field("cache", caching, Field.Store.YES, Field.Index.NO)); + } // add timestamp when fetched, for deduplication doc.add(new Field("tstamp", Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Wed Mar 7 15:37:21 2007 @@ -105,6 +105,11 @@ if (index >= 0) { metaTags.setNoFollow(); } + + index = directives.indexOf("noarchive"); + if (index >= 0) { + metaTags.setNoCache(); + } } } // end if (name == robots) Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Mar 7 15:37:21 2007 @@ -33,6 +33,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.hadoop.conf.*; @@ -100,7 +101,9 @@ private DOMContentUtils utils; private HtmlParseFilters htmlParseFilters; - + + private String cachingPolicy; + public Parse getParse(Content content) { HTMLMetaTags metaTags = new HTMLMetaTags(); @@ -202,10 +205,6 @@ } } - if (!metaTags.getNoCache()) { // okay to cache - // ??? FIXME ??? - } - ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); @@ -217,7 +216,11 @@ Parse parse = new ParseImpl(text, parseData); // run filters on parse - return this.htmlParseFilters.filter(content, parse, metaTags, root); + parse = this.htmlParseFilters.filter(content, parse, metaTags, root); + if (metaTags.getNoCache()) { // not okay to cache + parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); + } + return parse; } private DocumentFragment parse(InputSource input) throws Exception { @@ -302,6 +305,8 @@ this.defaultCharEncoding = getConf().get( "parser.character.encoding.default", "windows-1252"); this.utils = new DOMContentUtils(conf); + this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", + Nutch.CACHING_FORBIDDEN_CONTENT); } public Configuration getConf() { Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cached.jsp?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Wed Mar 7 15:37:21 2007 @@ -23,6 +23,7 @@ import="org.apache.nutch.searcher.*" import="org.apache.nutch.parse.ParseData" import="org.apache.nutch.metadata.Metadata" + import="org.apache.nutch.metadata.Nutch" import="org.apache.hadoop.conf.Configuration" import="org.apache.nutch.util.NutchConfiguration" %><% @@ -82,6 +83,17 @@ FIXME: have to sanitize 'content' : e.g. removing unncessary part of head elememt --> +<% + String caching = details.getValue("cache"); + String url = details.getValue("url"); + if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { +%> +Display of this content was administratively prohibited by the webmaster. +You may visit the original page instead: <a href="<%=url%>"><%=url%></a>. +<% + return; + } +%> <% if (contentType.startsWith("text/html")) {%> <% if (content != null && !content.equals("")) {%> Modified: lucene/nutch/trunk/src/web/jsp/search.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/search.jsp?view=diff&rev=515844&r1=515843&r2=515844 ============================================================================== --- lucene/nutch/trunk/src/web/jsp/search.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/search.jsp Wed Mar 7 15:37:21 2007 @@ -24,6 +24,7 @@ import="java.net.*" import="org.apache.nutch.html.Entities" + import="org.apache.nutch.metadata.Nutch" import="org.apache.nutch.searcher.*" import="org.apache.nutch.plugin.*" import="org.apache.nutch.clustering.*" @@ -194,7 +195,6 @@ Hit[] show = hits.getHits(start, realEnd-start); HitDetails[] details = bean.getDetails(show); Summary[] summaries = bean.getSummary(details, query); - bean.LOG.info("total hits: " + hits.getTotal()); %> @@ -228,6 +228,13 @@ String url = detail.getValue("url"); String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); String summary = summaries[i].toHtml(true); + String caching = detail.getValue("cache"); + boolean showSummary = true; + boolean showCached = true; + if (caching != null) { + showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL); + showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE); + } if (title == null || title.equals("")) { // use url for docs w/o title title = url; @@ -235,12 +242,16 @@ %> <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b> <%@ include file="more.jsp" %> - <% if (!"".equals(summary)) { %> + <% if (!"".equals(summary) && showSummary) { %> <br><%=summary%> <% } %> <br> <span class="url"><%=Entities.encode(url)%></span> - (<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>) + <% + if (showCached) { + %>(<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>) <% + } + %> (<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString, "UTF-8")%>&lang=<%=queryLang%>"><i18n:message key="explain"/></a>) (<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>) <% if (hit.moreFromDupExcluded()) { ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs