[Nutch-cvs] playground/conf nutch-default.xml,NONE,1.1 .cvsignore,NONE,1.1 banned-hosts.txt,NONE,1.1 nutch-conf.xsl,NONE,1.1 regex-urlfilter-default.txt,NONE,1.1 common-terms.utf8,NONE,1.1

joa23 Wed, 28 Jan 2004 17:18:27 -0800

Update of /cvsroot/nutch/playground/conf
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13033/conf


Added Files:
        nutch-default.xml .cvsignore banned-hosts.txt nutch-conf.xsl 
        regex-urlfilter-default.txt common-terms.utf8 
Log Message:
intial commit

--- NEW FILE: nutch-default.xml ---
<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>

<!-- Do not modify this file directly.  Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there.  If nutch-site.xml does not already exist, create it.      -->

<nutch-conf>

<!-- HTTP properties -->

<property>
  <name>http.agent.name</name>
  <value>NutchCVS</value>
  <description>Our HTTP 'User-Agent' request header.</description>
</property>

<property>
  <name>http.robots.agents</name>
  <value>NutchCVS,Nutch,*</value>
  <description>The agent strings we'll look for in robots.txt files,
  comma-separated, in decreasing order of precedence.</description>
</property>

<property>
  <name>http.agent.description</name>
  <value>Nutch</value>
  <description>Further description of our bot; this text is used in
  the User-Agent header.  It appears in parentheses after the agent name.
  </description>
</property>

<property>
  <name>http.agent.url</name>
  <value>http://www.nutch.org/docs/en/bot.html</value>
  <description>A URL to advertise in the User-Agent header.  This will
   appear in parentheses after the agent name.
  </description>
</property>

<property>
  <name>http.agent.email</name>
  <value>[EMAIL PROTECTED]</value>
  <description>An email address to advertise in the HTTP 'From' request
   header and User-Agent header.</description>
</property>

<property>
  <name>http.agent.version</name>
  <value>0.03-dev</value>
  <description>A version string to advertise in the User-Agent 
   header.</description>
</property>

<property>
  <name>http.timeout</name>
  <value>10000</value>
  <description>The default network timeout, in milliseconds.</description>
</property>

<property>
  <name>http.content.limit</name>
  <value>65536</value>
  <description>The default length limit for downloaded content, in
   bytes.  Content longer than this is truncated.</description>
</property>

<property>
  <name>http.version.1.1</name>
  <value>true</value>
  <description>If true, the fetcher will attempt to use HTTP version 1.1
  and gzip encoding.</description>
</property>

<!-- web db properties -->

<property>
  <name>db.default.fetch.interval</name>
  <value>30</value>
  <description>The default number of days between re-fetches of a page.
  </description>
</property>

<property>
  <name>db.score.injected</name>
  <value>2.0</value>
  <description>The score of new pages added by the injector.
  </description>
</property>

<property>
  <name>db.score.link.external</name>
  <value>1.0</value>
  <description>The score factor for new pages added due to a link from
  another host, relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.score.link.internal</name>
  <value>0.5</value>
  <description>The score factor for pages added due to a link from the
  same host, relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.max.outlinks.per.page</name>
  <value>100</value>
  <description>The maximum number of outlinks that we'll process for a page.
  </description>
</property>

<property>
  <name>db.max.anchor.length</name>
  <value>100</value>
  <description>The maximum number of characters permitted in an anchor.
  </description>
</property>

<!-- fetcher properties -->

<property>
  <name>fetcher.server.delay</name>
  <value>20</value>
  <description>The number of seconds the fetcher will delay between 
   successive requests to the same server.</description>
</property>

<property>
  <name>fetcher.threads.fetch</name>
  <value>300</value>
  <description>The number of FetcherThreads the fetcher should use.
    This also determines the maximum number of requests that are
    made at once (each FetcherThread handles one connection).</description>
</property>

<property>
  <name>fetcher.threads.output</name> 
  <value>5</value>
  <description>The number of OutputThreads to use.  When adjusting
    this, remember that each thread could be holding a raw page, its
    DOM structure, plaintext, and extracted links in memory.
  </description>
</property>

<property>
  <name>fetcher.stats.minutes</name>
  <value>2</value>
  <description>Controls how often the fetcher will dump progress statistics
    to the logs, in minutes.</description>
</property>

<property>
  <name>fetcher.request.queue</name>
  <value>25000</value>
  <description>The maximum number of unfetched requests to queue in
    memory.</description>
</property>

<property>
  <name>fetcher.output.queue</name>
  <value>250</value>
  <description>The maximum number of completed (but unwritten) requests to
    queue in memory before throttling the fetcher.</description>
</property>

<property>
  <name>fetcher.active.servers</name>
  <value>10000</value>
  <description>The maximum number of distinct servers that may be referenced
    by queued requests.</description>
</property>

<property>
  <name>fetcher.robots.cache</name>
  <value>20000</value>
  <description>The minimum number of robots.txt files to cache for 
    inactive servers.</description>
</property>

<property>
  <name>fetcher.server.maxurls</name>
  <value>5000</value>
  <description>The maximum number of URLs that may be queued at once for a 
    single host.</description>
</property>

<property>
  <name>fetcher.lowservers.threshold</name>
  <value>1000</value>
  <description>When there are fewer than this many servers in the fetcher's
    active queues, each server's queue of URLs will be pruned to 
    fetcher.lowservers.maxurls.
    </description>
</property>

<property>
  <name>fetcher.lowservers.maxurls</name>
  <value>50</value>
  <description>See description of fetcher.lowservers.threshold.
    </description>
</property>

<property>
  <name>fetcher.retry.max</name>
  <value>3</value>
  <description>The maximum number of times the fetcher will attempt to get 
    a page that has encountered recoverable errors. </description>
</property>

<property>
  <name>fetcher.redirect.max</name>
  <value>3</value>
  <description>The maximum number of redirects the fetcher will follow when
    trying to fetch a page.</description>
</property>

<property>
  <name>fetcher.host.consecutive.failures</name>
  <value>3</value>
  <description>
    The maximum number of consecutive failures, excluding 404 errors, to
    allow on a given server before declaring it dead (note: each failure
    will have had up to fetcher.retry.max retries). </description>
</property>

<property>
  <name>fetcher.host.max.failerr.rate</name>
  <value>0.5</value>
  <description>The maximum fetch error rate, excluding 404s, to allow for
    a given server before declaring it dead.  Note: errors include
    transient issues, and multiple retries contribute to the score (so,
    getting the first page on the 3rd try gives you a 0.66 "failerr.rate").
  </description>
</property>

<property>
  <name>fetcher.host.min.requests.rate</name>
  <value>10</value>
  <description>A threshold on the minimum number of requests we issue 
    to a host before applying fetcher.host.max.failerr.rate.  At least
    this many requests will be issued before declaring a host dead due
    to error rate.  Note: this setting does not affect 
    fetcher.host.consecutive.failures!
    </description>
</property>

<property>
  <name>excludehosts.suffix.file</name>
  <value>banned-hosts.txt</value>
  <description>Filename which contains list of hostnames we shouldn't 
    fetch from.</description>
</property>

<property>
  <name>fetcher.trace.longmsg</name>
  <value>false</value>
  <description>Whether to use "long messages" in the trace portion
  of the logged output (if set to false, terse messages will be used).
  </description>
</property>

<property>
  <name>fetcher.trace.success</name>
  <value>false</value>
  <description>Whether to log successful fetches in the trace log.
  </description>
</property>

<property>
  <name>fetcher.trace.not.found</name>
  <value>false</value>
  <description>Whether to log 404/Not Found errors in the trace log.
  </description>
</property>

<property>
  <name>fetcher.throttle.period</name>
  <value>30</value>
  <description>How often throttling behavior should be readjusted based
  on current bandwidth usage, measured in seconds.  Set to -1 to disable
  throttling.
  </description>
</property>

<property>
  <name>fetcher.throttle.bandwidth</name>
  <value>-1</value>
  <description>The desired amount of bandwidth the fetcher should use 
  (aside from DNS and TCP overhead), in kbits/s.   Set to -1 to disable 
  throttling.  Note: This is *not* a cap, this is a target for 
  bandwidth usage over time.
  </description>
</property>

<property>
  <name>fetcher.throttle.initial.threads</name>
  <value>10</value>
  <description>The number of threads that should be active initially.
  </description>
</property>

<!-- i/o properties -->

<property>
  <name>io.sort.factor</name>
  <value>100</value>
  <description>The number of streams to merge at once while sorting
  files.  This determines the number of open file handles.</description>
</property>

<property>
  <name>io.sort.mb</name>
  <value>100</value>
  <description>The total amount of buffer memory to use while sorting 
  files, in megabytes.  By default, gives each merge stream 1MB, which
  should minimize seeks.</description>
</property>

<!-- indexer properties -->

<property>
  <name>indexer.score.power</name>
  <value>0.5</value>
  <description>Determines the power of link analysis scores.  Each
  page's boost is set to <i>score<sup>scorePower</sup></i> where
  <i>score</i> is its link analysis score and <i>scorePower</i> is the
  value of this parameter.  This is compiled into indexes, so, when
  this is changed, pages must be re-indexed for it to take
  effect.</description>
</property>

<property>
  <name>indexer.max.title.length</name>
  <value>100</value>
  <description>The maximum number of characters of a title that are indexed.
  </description>
</property>

<!-- analysis properties -->

<property>
  <name>analysis.common.terms.file</name>
  <value>common-terms.utf8</value>
  <description>The name of a file containing a list of common terms
  that should be indexed in n-grams.</description>
</property>

<!-- searcher properties -->
<property>
  <name>searcher.dir</name>
  <value>.</value>
  <description>
  Path to root of index directories.  This directory is searched (in
  order) for either the file search-servers.txt, containing a list of
  distributed search servers, or the directory "index" containing
  merged indexes, or the directory "segments" containing segment
  indexes.
  </description>
</property>

<!-- urlfilter properties -->
<property>
  <name>urlfilter.class</name>
  <value>net.nutch.net.RegexURLFilter</value>
  <description>Name of the class used to filter URLs.</description>
</property>

<property>
  <name>urlfilter.regex.file</name>
  <value>regex-urlfilter-default.txt</value>
  <description>Name of file on CLASSPATH containing default regular
  expressions used by RegexURLFilter.</description>
</property>

<!-- plugin properties -->

<property>
  <name>plugin.folder</name>
  <value>plugins</value>
  <description>Directory where Nutch plugins are located.</description>
</property>

</nutch-conf>

--- NEW FILE: .cvsignore ---
nutch-site.xml

--- NEW FILE: banned-hosts.txt ---
# list domains you don't want to crawl (by suffix) in this file.
# hash-marks indicate comments, and lines will be whitespace trimmed.
# list only one domain per line.

--- NEW FILE: nutch-conf.xsl ---
<?xml version="1.0"?>
<!--
  Renders a configuration document whose root element is nutch-conf as an
  HTML table: a header row followed by one row per property child, showing
  that property's name, value and description.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="html"/>
<xsl:template match="nutch-conf">
<html>
<body>
<table border="1">
<tr>
 <td>name</td>
 <td>value</td>
 <td>description</td>
</tr>
<xsl:for-each select="property">
<tr>
  <td><xsl:value-of select="name"/></td>
  <td><xsl:value-of select="value"/></td>
  <!-- copy-of (rather than value-of) keeps any inline markup a description
       contains, e.g. i and sup elements, instead of flattening it to text. -->
  <td><xsl:copy-of select="description/node()"/></td>
</tr>
</xsl:for-each>
</table>
</body>
</html>
</xsl:template>
</xsl:stylesheet>

--- NEW FILE: regex-urlfilter-default.txt ---
# The default config file for RegexURLFilter.

# Each non-comment, non-blank line contains a regular expression
# prefixed by '+' or '-'.  The first matching pattern in the file
# determines whether a URL is included or ignored.  If no pattern
# matches, the URL is ignored.

# Override this file by creating a new file, copying this file.  Then,
# in nutch-site.xml, set urlfilter.regex.file to the name of this new
# file.  (If nutch-site.xml does not exist, create it.  It has the
# same format as nutch-default.xml, and specifies overrides.)

# skip 'file:' urls
-^file:

# skip image and other suffixes we can't yet parse
# (each suffix is listed in both lower and upper case)
-\.(gif|GIF|jpg|JPG|pdf|PDF|ico|ICO|css|CSS)$

# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]

# accept anything else
+.

--- NEW FILE: common-terms.utf8 ---
# Common terms and phrases which will be indexed in n-grams
# in order to optimize search.
content:a
content:and
content:for
content:in
content:of
content:the
content:to
url:com
url:http
url:http-www
url:www



-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] playground/conf nutch-default.xml,NONE,1.1 .cvsignore,NONE,1.1 banned-hosts.txt,NONE,1.1 nutch-conf.xsl,NONE,1.1 regex-urlfilter-default.txt,NONE,1.1 common-terms.utf8,NONE,1.1

Reply via email to