Update of /cvsroot/nutch/playground/conf
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13033/conf
Added Files:
nutch-default.xml .cvsignore banned-hosts.txt nutch-conf.xsl
regex-urlfilter-default.txt common-terms.utf8
Log Message:
initial commit
--- NEW FILE: nutch-default.xml ---
<?xml version="1.0" encoding="UTF-8"?>
<!-- Associates this configuration file with nutch-conf.xsl so that an -->
<!-- XSLT-aware viewer can render the property list as a table. -->
<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
<nutch-conf>
<!-- HTTP properties: the identity advertised in request headers and -->
<!-- basic limits applied to HTTP fetches. -->
<property>
  <name>http.agent.name</name>
  <value>NutchCVS</value>
  <description>Our HTTP 'User-Agent' request header.</description>
</property>

<property>
  <name>http.robots.agents</name>
  <value>NutchCVS,Nutch,*</value>
  <description>The agent strings we'll look for in robots.txt files,
  comma-separated, in decreasing order of precedence.</description>
</property>

<property>
  <name>http.agent.description</name>
  <value>Nutch</value>
  <description>Further description of our bot. This text is used in
  the User-Agent header. It appears in parentheses after the agent name.
  </description>
</property>

<property>
  <name>http.agent.url</name>
  <value>http://www.nutch.org/docs/en/bot.html</value>
  <description>A URL to advertise in the User-Agent header. This will
  appear in parentheses after the agent name.
  </description>
</property>

<!-- NOTE(review): the value of http.agent.email below looks like a -->
<!-- placeholder substituted by a mail archive rather than a real -->
<!-- address; confirm against the repository copy of this file. -->
<property>
  <name>http.agent.email</name>
  <value>[EMAIL PROTECTED]</value>
  <description>An email address to advertise in the HTTP 'From' request
  header and User-Agent header.</description>
</property>

<property>
  <name>http.agent.version</name>
  <value>0.03-dev</value>
  <description>A version string to advertise in the User-Agent
  header.</description>
</property>

<property>
  <name>http.timeout</name>
  <value>10000</value>
  <description>The default network timeout, in milliseconds.</description>
</property>

<property>
  <name>http.content.limit</name>
  <value>65536</value>
  <description>The default length limit for downloaded content, in
  bytes. Content longer than this is truncated.</description>
</property>

<property>
  <name>http.version.1.1</name>
  <value>true</value>
  <description>If true, the fetcher will attempt to use HTTP version 1.1
  and gzip encoding.</description>
</property>
<!-- web db properties: re-fetch interval, page scoring factors, and -->
<!-- per-page limits on outlinks and anchor text. -->
<property>
  <name>db.default.fetch.interval</name>
  <value>30</value>
  <description>The default number of days between re-fetches of a page.
  </description>
</property>

<property>
  <name>db.score.injected</name>
  <value>2.0</value>
  <description>The score of new pages added by the injector.
  </description>
</property>

<!-- The value is written as a plain decimal (no Java 'f' suffix) so it -->
<!-- matches the other score values in this section. -->
<property>
  <name>db.score.link.external</name>
  <value>1.0</value>
  <description>The score factor for new pages added due to a link from
  another host relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.score.link.internal</name>
  <value>0.5</value>
  <description>The score factor for pages added due to a link from the
  same host, relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.max.outlinks.per.page</name>
  <value>100</value>
  <description>The maximum number of outlinks that we'll process for a page.
  </description>
</property>

<property>
  <name>db.max.anchor.length</name>
  <value>100</value>
  <description>The maximum number of characters permitted in an anchor.
  </description>
</property>
<!-- fetcher properties: per-server politeness delay, thread counts, -->
<!-- and how often progress statistics are logged. -->
<property>
  <name>fetcher.server.delay</name>
  <value>20</value>
  <description>The number of seconds the fetcher will delay between
  successive requests to the same server.</description>
</property>

<property>
  <name>fetcher.threads.fetch</name>
  <value>300</value>
  <description>The number of FetcherThreads the fetcher should use.
  This also determines the maximum number of requests that are
  made at once (each FetcherThread handles one connection).</description>
</property>

<property>
  <name>fetcher.threads.output</name>
  <value>5</value>
  <description>The number of OutputThreads to use. When adjusting
  this, remember that each thread could be holding a raw page, its
  DOM structure, plaintext, and extracted links in memory.
  </description>
</property>

<property>
  <name>fetcher.stats.minutes</name>
  <value>2</value>
  <description>Controls how often the fetcher will dump progress statistics
  to the logs, in minutes.</description>
</property>
<!-- fetcher queue sizing: in-memory limits on queued requests, active -->
<!-- servers, cached robots.txt files, and per-host URL queues. -->
<property>
  <name>fetcher.request.queue</name>
  <value>25000</value>
  <description>The maximum number of unfetched requests to queue in
  memory.</description>
</property>

<property>
  <name>fetcher.output.queue</name>
  <value>250</value>
  <description>The maximum number of completed (but unwritten) requests to
  queue in memory before throttling the fetcher.</description>
</property>

<property>
  <name>fetcher.active.servers</name>
  <value>10000</value>
  <description>The maximum number of distinct servers that may be referenced
  by queued requests.</description>
</property>

<property>
  <name>fetcher.robots.cache</name>
  <value>20000</value>
  <description>The minimum number of robots.txt files to cache for
  inactive servers.</description>
</property>

<property>
  <name>fetcher.server.maxurls</name>
  <value>5000</value>
  <description>The maximum number of URLs that may be queued at once for a
  single host.</description>
</property>

<!-- The next two properties work as a pair: the threshold decides when -->
<!-- pruning happens, maxurls decides how far each queue is pruned. -->
<property>
  <name>fetcher.lowservers.threshold</name>
  <value>1000</value>
  <description>When there are fewer than this many servers in the fetcher's
  active queues, each server's queue of URLs will be pruned to
  fetcher.lowservers.maxurls.
  </description>
</property>

<property>
  <name>fetcher.lowservers.maxurls</name>
  <value>50</value>
  <description>See description of fetcher.lowservers.threshold.
  </description>
</property>
<!-- fetcher retry and failure handling: retry/redirect limits, the -->
<!-- thresholds for declaring a host dead, and the banned-hosts list. -->
<property>
  <name>fetcher.retry.max</name>
  <value>3</value>
  <description>The maximum number of times the fetcher will attempt to get
  a page that has encountered recoverable errors. </description>
</property>

<property>
  <name>fetcher.redirect.max</name>
  <value>3</value>
  <description>The maximum number of redirects the fetcher will follow when
  trying to fetch a page.</description>
</property>

<property>
  <name>fetcher.host.consecutive.failures</name>
  <value>3</value>
  <description>
  The maximum number of consecutive failures, excluding 404 errors, to
  allow on a given server before declaring it dead (note: each failure
  will have had up to fetcher.retry.max retries). </description>
</property>

<property>
  <name>fetcher.host.max.failerr.rate</name>
  <value>.5</value>
  <description>The maximum fetch error rate, excluding 404s, to allow for
  a given server before declaring it dead. Note: errors include
  transient issues, and multiple retries contribute to the score (so,
  getting the first page on the 3rd try gives you a .66 "failerr.rate").
  </description>
</property>

<property>
  <name>fetcher.host.min.requests.rate</name>
  <value>10</value>
  <description>A threshold on the minimum number of requests we issue
  to a host before applying fetcher.host.max.failerr.rate. At least
  this many requests will be issued before declaring a host dead due
  to error rate. Note: this setting does not affect
  fetcher.host.consecutive.failures!
  </description>
</property>

<property>
  <name>excludehosts.suffix.file</name>
  <value>banned-hosts.txt</value>
  <description>Filename which contains list of hostnames we shouldn't
  fetch from.</description>
</property>
<!-- fetcher trace logging: message verbosity and which fetch outcomes -->
<!-- are written to the trace log. -->
<property>
  <name>fetcher.trace.longmsg</name>
  <value>false</value>
  <description>Whether to use "long messages" in the trace portion
  of the logged output (if set to false, terse messages will be used).
  </description>
</property>

<property>
  <name>fetcher.trace.success</name>
  <value>false</value>
  <description>Whether to log successful fetches in the trace log.
  </description>
</property>

<property>
  <name>fetcher.trace.not.found</name>
  <value>false</value>
  <description>Whether to log 404/Not Found errors in the trace log.
  </description>
</property>
<!-- fetcher bandwidth throttling: adjustment period, target bandwidth, -->
<!-- and the initial number of active threads. -->
<property>
  <name>fetcher.throttle.period</name>
  <value>30</value>
  <description>How often throttling behavior should be readjusted based
  on current bandwidth usage, measured in seconds. Set to -1 to disable
  throttling.
  </description>
</property>

<property>
  <name>fetcher.throttle.bandwidth</name>
  <value>-1</value>
  <description>The desired amount of bandwidth the fetcher should use
  (aside from DNS and TCP overhead), in kbits/s. Set to -1 to disable
  throttling. Note: This is *not* a cap, this is a target for
  bandwidth usage over time.
  </description>
</property>

<property>
  <name>fetcher.throttle.initial.threads</name>
  <value>10</value>
  <description>The number of threads that should be active initially.
  </description>
</property>
<!-- i/o properties: merge fan-in and buffer memory used when sorting files -->
<property>
  <name>io.sort.factor</name>
  <value>100</value>
  <description>The number of streams to merge at once while sorting
  files. This determines the number of open file handles.</description>
</property>

<property>
  <name>io.sort.mb</name>
  <value>100</value>
  <description>The total amount of buffer memory to use while sorting
  files, in megabytes. By default, gives each merge stream 1MB, which
  should minimize seeks.</description>
</property>
<!-- indexer properties: score boosting and the indexed title length -->
<property>
  <name>indexer.score.power</name>
  <value>0.5</value>
  <description>Determines the power of link analysis scores. Each
  page's boost is set to <i>score<sup>scorePower</sup></i> where
  <i>score</i> is its link analysis score and <i>scorePower</i> is the
  value of this parameter. This is compiled into indexes, so, when
  this is changed, pages must be re-indexed for it to take
  effect.</description>
</property>

<property>
  <name>indexer.max.title.length</name>
  <value>100</value>
  <description>The maximum number of characters of a title that are indexed.
  </description>
</property>
<!-- analysis properties: the common-terms list used for n-gram indexing -->
<property>
  <name>analysis.common.terms.file</name>
  <value>common-terms.utf8</value>
  <description>The name of a file containing a list of common terms
  that should be indexed in n-grams.</description>
</property>
<!-- searcher properties: where the searcher looks for its indexes -->
<property>
  <name>searcher.dir</name>
  <value>.</value>
  <description>
  Path to root of index directories. This directory is searched (in
  order) for either the file search-servers.txt, containing a list of
  distributed search servers, or the directory "index" containing
  merged indexes, or the directory "segments" containing segment
  indexes.
  </description>
</property>
<!-- urlfilter properties: the URL filter class and its regex rule file -->
<property>
  <name>urlfilter.class</name>
  <value>net.nutch.net.RegexURLFilter</value>
  <description>Name of the class used to filter URLs.</description>
</property>

<property>
  <name>urlfilter.regex.file</name>
  <value>regex-urlfilter-default.txt</value>
  <description>Name of file on CLASSPATH containing default regular
  expressions used by RegexURLFilter.</description>
</property>
<!-- plugin properties: location of the plugin directory -->
<property>
  <name>plugin.folder</name>
  <value>plugins</value>
  <description>The directory where Nutch plugins are located.</description>
</property>
</nutch-conf>
--- NEW FILE: .cvsignore ---
nutch-site.xml
--- NEW FILE: banned-hosts.txt ---
# list domains you don't want to crawl (by suffix) in this file.
# hash-marks indicate comments, and lines will be whitespace trimmed.
# list only one domain per line.
--- NEW FILE: nutch-conf.xsl ---
<?xml version="1.0"?>
<!--
  Renders a nutch-conf document as an HTML table with one row per
  property and three columns: name, value, description.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <xsl:output method="html"/>

  <!-- Page skeleton: a header row followed by one row per property child. -->
  <xsl:template match="nutch-conf">
    <html>
      <body>
        <table border="1">
          <tr>
            <td>name</td>
            <td>value</td>
            <td>description</td>
          </tr>
          <xsl:apply-templates select="property"/>
        </table>
      </body>
    </html>
  </xsl:template>

  <!-- One table row for a single property. -->
  <xsl:template match="property">
    <tr>
      <td><xsl:value-of select="name"/></td>
      <td><xsl:value-of select="value"/></td>
      <!-- copy-of (rather than value-of) keeps inline markup such as
           <i> and <sup> that some descriptions contain; value-of would
           flatten it to plain text. Text-only descriptions render the same. -->
      <td><xsl:copy-of select="description/node()"/></td>
    </tr>
  </xsl:template>
</xsl:stylesheet>
--- NEW FILE: regex-urlfilter-default.txt ---
# The default config file for RegexURLFilter.
# Each non-comment, non-blank line contains a regular expression
# prefixed by '+' or '-'. The first matching pattern in the file
# determines whether a URL is included or ignored. If no pattern
# matches, the URL is ignored.
# Override this file by creating a new file, copying this file. Then,
# in nutch-site.xml, set urlfilter.regex.file to the name of this new
# file. (If nutch-site.xml does not exist, create it. It has the
# same format as nutch-default.xml, and specifies overrides.)
# skip 'file:' urls: the '-' prefix marks this pattern as an exclusion and
# '^' anchors it, so only URLs that begin with 'file:' are rejected
-^file:
# skip image and other suffixes we can't yet parse
# (each extension is listed in both lower and upper case; '$' anchors the
# match to the end of the URL)
-\.(gif|GIF|jpg|JPG|pdf|PDF|ico|ICO|css|CSS)$
# skip URLs containing certain characters as probable queries, etc.
# NOTE(review): this rule had been replaced by a mail-archive placeholder and
# had no '+'/'-' prefix; the character class below is a reconstruction,
# confirm it against the repository copy of this file.
-[?*!@=]
# accept anything else: '.' matches any character, so every URL that was
# not matched by an earlier pattern in this file is included
+.
--- NEW FILE: common-terms.utf8 ---
# Common terms and phrases which will be indexed in n-grams
# in order to optimize search.
content:a
content:and
content:for
content:in
content:of
content:the
content:to
url:com
url:http
url:http-www
url:www
-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs