Added: nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml?rev=1889891&view=auto
==============================================================================
--- nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml (added)
+++ nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml Fri May 14 10:08:55 2021
@@ -0,0 +1,2764 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into nutch-site.xml and change them -->
+<!-- there. If nutch-site.xml does not already exist, create it. -->
+
+<configuration>
+
+<!-- general properties -->
+
+<property>
+ <name>store.ip.address</name>
+ <value>false</value>
+ <description>Enables us to capture the specific IP address
+ (InetSocketAddress) of the host which we connect to via
+ the given protocol. Currently supported are protocol-ftp and
+ protocol-http.
+ </description>
+</property>
+
+<!-- file properties -->
+
+<property>
+ <name>file.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content using the file://
+ protocol, in bytes. If this value is non-negative (>=0), content longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the http.content.limit setting.
+ </description>
+</property>
+
+<property>
+ <name>file.crawl.parent</name>
+ <value>true</value>
+ <description>The crawler is not restricted to the directories that you specified in the
+ URLs file, but also jumps into the parent directories. For your own crawls you can
+ change this behavior (set to false) so that only directories beneath the directories that you specify get
+ crawled.</description>
+</property>
+
+<property>
+ <name>file.crawl.redirect_noncanonical</name>
+ <value>true</value>
+ <description>
+ If true, protocol-file treats non-canonical file names as
+ redirects and does not canonicalize file names internally. A file
+ name containing symbolic links as path elements is then not
+ resolved and "fetched" but recorded as a redirect with the
+ canonical name (all links on the path are resolved) as redirect
+ target.
+ </description>
+</property>
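The file.content.limit above is commonly raised in nutch-site.xml when local documents exceed 1 MiB; a minimal sketch raising it to 8 MiB (the value is illustrative):

<property>
  <name>file.content.limit</name>
  <!-- illustrative value: 8 MiB; use -1 to disable truncation -->
  <value>8388608</value>
</property>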
+
+<property>
+ <name>file.content.ignored</name>
+ <value>true</value>
+ <description>If true, no file content will be saved during fetch.
+ This is probably what we want most of the time, since file:// URLs
+ are meant to be local and we can always use them directly at parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NOT IMPLEMENTED YET !!
+ </description>
+</property>
+
+<!-- HTTP properties -->
+
+<property>
+ <name>http.agent.name</name>
+ <value></value>
+ <description>HTTP 'User-Agent' request header. MUST NOT be empty -
+ please set this to a single word uniquely related to your organization.
+
+ NOTE: You should also check other related properties:
+
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
+
+ and set their values appropriately.
+
+ </description>
+</property>
+
+<property>
+ <name>http.robots.agents</name>
+ <value></value>
+ <description>Any other agents, apart from 'http.agent.name', that the robots
+ parser would look for in robots.txt. Multiple agents can be provided using
+ a comma as delimiter, e.g. mybot,foo-spider,bar-crawler
+
+ The ordering of agents does NOT matter and the robots parser would make
+ its decision based on the agent which first matches the robots rules.
+ Also, there is NO need to add a wildcard (i.e. "*") to this string as the
+ robots parser would smartly take care of a no-match situation.
+
+ If no value is specified, by default the HTTP agent (i.e. 'http.agent.name')
+ would be used for user agent matching by the robots parser.
+ </description>
+</property>
+
+<property>
+ <name>http.robot.rules.whitelist</name>
+ <value></value>
+ <description>Comma-separated list of hostnames or IP addresses to ignore
+ robot rules parsing for. Use with care and only if you are explicitly
+ allowed by the site owner to ignore the site's robots.txt!
+ </description>
+</property>
+
+<property>
+ <name>http.robots.403.allow</name>
+ <value>true</value>
+ <description>Some servers return HTTP status 403 (Forbidden) if
+ /robots.txt doesn't exist. This should probably mean that we are
+ allowed to crawl the site nonetheless. If this is set to false,
+ then such sites will be treated as forbidden.</description>
+</property>
+
+<property>
+ <name>http.agent.description</name>
+ <value></value>
+ <description>Further description of our bot - this text is used in
+ the User-Agent header. It appears in parentheses after the agent name.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.url</name>
+ <value></value>
+ <description>A URL to advertise in the User-Agent header. This will
+ appear in parentheses after the agent name. Custom dictates that this
+ should be a URL of a page explaining the purpose and behavior of this
+ crawler.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.email</name>
+ <value></value>
+ <description>An email address to advertise in the HTTP 'From' request
+ header and User-Agent header. A good practice is to mangle this
+ address (e.g. 'info at example dot com') to avoid spamming.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.version</name>
+ <value>Nutch-1.18</value>
+ <description>A version string to advertise in the User-Agent
+ header.</description>
+</property>
+
+<property>
+ <name>http.agent.rotate</name>
+ <value>false</value>
+ <description>
+ If true, instead of http.agent.name, alternating agent names are
+ chosen from a list provided via http.agent.rotate.file.
+ </description>
+</property>
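Because http.agent.name has no default value and fetching will not work without it, a minimal nutch-site.xml override is usually the first configuration step; the bot name below is a placeholder:

<property>
  <name>http.agent.name</name>
  <!-- placeholder: use a single word identifying your organization -->
  <value>MyNutchBot</value>
</property>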
+
+<property>
+ <name>http.agent.rotate.file</name>
+ <value>agents.txt</value>
+ <description>
+ File containing alternative user agent names to be used instead of
+ http.agent.name on a rotating basis if http.agent.rotate is true.
+ Each line of the file should contain exactly one agent
+ specification including name, version, description, URL, etc.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.host.cookie.file</name>
+ <value>cookies.txt</value>
+ <description>
+ File containing per-host configured cookies.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.host</name>
+ <value></value>
+ <description>Name or IP address of the host on which the Nutch crawler
+ would be running. Currently this is used by the 'protocol-httpclient'
+ plugin.
+ </description>
+</property>
+
+<property>
+ <name>http.timeout</name>
+ <value>10000</value>
+ <description>The default network timeout, in milliseconds.</description>
+</property>
+
+<property>
+ <name>http.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content using the http/https
+ protocols, in bytes. If this value is non-negative (>=0), content longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the file.content.limit setting.
+ </description>
+</property>
+
+<property>
+ <name>http.time.limit</name>
+ <value>-1</value>
+ <description>The time limit in seconds to fetch a single document.
+ If this value is non-negative (>=0), the HTTP protocol implementation
+ will stop reading from a socket after http.time.limit seconds have
+ been spent for fetching this document. The HTTP response is then
+ marked as truncated. The http.time.limit should be set to a longer
+ time period than http.timeout, as it applies to the entire duration
+ to fetch a document, not only the network timeout of a single I/O
+ operation. Note: supported only by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.partial.truncated</name>
+ <value>false</value>
+ <description>
+ If true the HTTP protocol implementation may store the content of
+ partial fetches and mark the response as truncated instead of
+ throwing an exception which will cause the fetch to fail. This
+ allows using the data which has already been fetched, instead of
+ retrying the fetch later. Note: supported only by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.tls.certificates.check</name>
+ <value>false</value>
+ <description>
+ Whether to check the TLS/SSL server certificates for validity.
+ If true, invalid (e.g., self-signed or expired) certificates are
+ rejected and the https connection fails. If false, insecure
+ TLS/SSL connections are allowed. Note that this property is
+ currently not supported by all http/https protocol plugins.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.host</name>
+ <value></value>
+ <description>The proxy hostname. If empty, no proxy is used.</description>
+</property>
+
+<property>
+ <name>http.proxy.port</name>
+ <value></value>
+ <description>The proxy port.</description>
+</property>
+
+<property>
+ <name>http.proxy.username</name>
+ <value></value>
+ <description>Username for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of the 'plugin.includes' property.
+ NOTE: For NTLM authentication, do not prefix the username with the
+ domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
+ </description>
+</property>
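For instance, routing all fetches through an HTTP proxy only requires the two host/port properties above; host and port below are illustrative:

<property>
  <name>http.proxy.host</name>
  <!-- illustrative proxy host -->
  <value>proxy.example.com</value>
</property>
<property>
  <name>http.proxy.port</name>
  <!-- illustrative proxy port -->
  <value>8080</value>
</property>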
+
+<property>
+ <name>http.proxy.password</name>
+ <value></value>
+ <description>Password for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of the 'plugin.includes' property.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.realm</name>
+ <value></value>
+ <description>Authentication realm for proxy. Do not define a value
+ if realm is not required or authentication should take place for any
+ realm. NTLM does not use the notion of realms. Specify the domain name
+ of NTLM authentication as the value for this property. To use this,
+ 'protocol-httpclient' must be present in the value of the
+ 'plugin.includes' property.
+ </description>
+</property>
+
+<property>
+ <name>http.auth.file</name>
+ <value>httpclient-auth.xml</value>
+ <description>Authentication configuration file for
+ the 'protocol-httpclient' plugin.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.type</name>
+ <value>HTTP</value>
+ <description>
+ Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type).
+ Note: supported by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.exception.list</name>
+ <value></value>
+ <description>A comma-separated list of hosts that don't use the proxy
+ (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
+ <name>http.useHttp11</name>
+ <value>true</value>
+ <description>
+ If true, use HTTP 1.1; if false, use HTTP 1.0.
+ </description>
+</property>
+
+<property>
+ <name>http.useHttp2</name>
+ <value>false</value>
+ <description>
+ If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
+ supported; if false, always use HTTP/1.1.
+
+ NOTE: HTTP/2 is currently only supported by protocol-okhttp and
+ requires at runtime Java 9 or a modified Java 8 with support for
+ ALPN (Application Layer Protocol Negotiation).
+ </description>
+</property>
+
+<property>
+ <name>http.accept.language</name>
+ <value>en-us,en-gb,en;q=0.7,*;q=0.3</value>
+ <description>Value of the "Accept-Language" request header field.
+ This allows selecting a non-English language as the default one to retrieve.
+ It is a useful setting for search engines built for a certain national group.
+ To send requests without an "Accept-Language" header field, this property must
+ be configured to contain a space character because an empty property does
+ not overwrite the default.
+ </description>
+</property>
+
+<property>
+ <name>http.accept</name>
+ <value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
+ <description>Value of the "Accept" request header field. A space character
+ as value will cause no "Accept" header field to be sent in the request.
+ </description>
+</property>
+
+<property>
+ <name>http.accept.charset</name>
+ <value>utf-8,iso-8859-1;q=0.7,*;q=0.7</value>
+ <description>Value of the "Accept-Charset" request header field. A space character
+ as value will cause no "Accept-Charset" header field to be sent in the request.
+ </description>
+</property>
+
+<property>
+ <name>http.store.responsetime</name>
+ <value>true</value>
+ <description>Enables us to record the response time of the
+ host, which is the time period between the start and the end of the
+ connection to a page's host. The response time in milliseconds
+ is stored in CrawlDb in the CrawlDatum's metadata under key "_rs_".
+ </description>
+</property>
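As the three Accept* descriptions above note, an empty value does not overwrite the default, so suppressing such a header requires a value of a single space, e.g.:

<property>
  <name>http.accept.language</name>
  <!-- a single space suppresses the Accept-Language header -->
  <value> </value>
</property>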
+
+<property>
+ <name>http.enable.if.modified.since.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces
+ bandwidth when enabled by not downloading pages that respond with an HTTP
+ 304 (Not Modified) status. URLs that are not downloaded are not passed through
+ parse or indexing filters. If you regularly modify filters, you should force
+ Nutch to also download unmodified pages by disabling this feature.
+ </description>
+</property>
+
+<property>
+ <name>http.enable.cookie.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+ is read from the CrawlDatum Cookie metadata field.
+ </description>
+</property>
+
+<!-- FTP properties -->
+
+<property>
+ <name>ftp.username</name>
+ <value>anonymous</value>
+ <description>ftp login username.</description>
+</property>
+
+<property>
+ <name>ftp.password</name>
+ <value>[email protected]</value>
+ <description>ftp login password.</description>
+</property>
+
+<property>
+ <name>ftp.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is non-negative (>=0), content longer than it will be truncated;
+ otherwise, no truncation at all.
+ Caution: the classical ftp RFCs never define partial transfer and, in fact,
+ some ftp servers out there do not handle client-side forced close-down very
+ well. Our implementation tries its best to handle such situations smoothly.
+ </description>
+</property>
+
+<property>
+ <name>ftp.timeout</name>
+ <value>60000</value>
+ <description>Default timeout for the ftp client socket, in milliseconds.
+ Please also see ftp.keep.connection below.</description>
+</property>
+
+<property>
+ <name>ftp.server.timeout</name>
+ <value>100000</value>
+ <description>An estimation of ftp server idle time, in milliseconds.
+ Typically it is 120000 milliseconds for many ftp servers out there.
+ Better be conservative here. Together with ftp.timeout, it is used to
+ decide if we need to delete (annihilate) the current ftp.client instance and
+ force starting another ftp.client instance anew. This is necessary because
+ a fetcher thread may not be able to obtain the next request from the queue in time
+ (due to idleness) before our ftp client times out or the remote server
+ disconnects. Used only when ftp.keep.connection is true (please see below).
+ </description>
+</property>
+
+<property>
+ <name>ftp.keep.connection</name>
+ <value>false</value>
+ <description>Whether to keep the ftp connection. Useful if crawling the same host
+ again and again. When set to true, it avoids connection, login and dir list
+ parser setup for subsequent URLs. If it is set to true, however, you must
+ make sure (roughly):
+ (1) ftp.timeout is less than ftp.server.timeout
+ (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
+ Otherwise there will be too many "delete client because idled too long"
+ messages in the thread logs.</description>
+</property>
+
+<property>
+ <name>ftp.follow.talk</name>
+ <value>false</value>
+ <description>Whether to log the dialogue between our client and the remote
+ server. Useful for debugging.</description>
+</property>
+
+<!-- web db properties -->
+<property>
+ <name>db.fetch.interval.default</name>
+ <value>2592000</value>
+ <description>The default number of seconds between re-fetches of a page (30 days).
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.interval.max</name>
+ <value>7776000</value>
+ <description>The maximum number of seconds between re-fetches of a page
+ (90 days). After this period every page in the db will be re-tried, no
+ matter what its status is.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.class</name>
+ <value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
+ <description>The implementation of fetch schedule. DefaultFetchSchedule simply
+ adds the original fetchInterval to the last fetch time, regardless of
+ page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt
+ to the rate at which a given page is changed.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.inc_rate</name>
+ <value>0.4</value>
+ <description>If a page is unmodified, its fetchInterval will be
+ increased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.dec_rate</name>
+ <value>0.2</value>
+ <description>If a page is modified, its fetchInterval will be
+ decreased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.min_interval</name>
+ <value>60.0</value>
+ <description>Minimum fetchInterval, in seconds.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.max_interval</name>
+ <value>31536000.0</value>
+ <description>Maximum fetchInterval, in seconds (365 days).
+ NOTE: this is limited by db.fetch.interval.max. Pages with
+ fetchInterval larger than db.fetch.interval.max
+ will be fetched anyway.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.sync_delta</name>
+ <value>true</value>
+ <description>If true, try to synchronize with the time of page change,
+ by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
+ between the last modification time and the last fetch time.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.sync_delta_rate</name>
+ <value>0.3</value>
+ <description>See sync_delta for description. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.mime.file</name>
+ <value>adaptive-mimetypes.txt</value>
+ <description>The configuration file for the MimeAdaptiveFetchSchedule.
+ </description>
+</property>
+
+<property>
+ <name>db.update.additions.allowed</name>
+ <value>true</value>
+ <description>If true, updatedb will add newly discovered URLs, if false
+ only already existing URLs in the CrawlDb will be updated and no new
+ URLs will be added.
+ </description>
+</property>
+
+<property>
+ <name>db.preserve.backup</name>
+ <value>true</value>
+ <description>If true, updatedb will keep a backup of the previous CrawlDB
+ version in the old directory. In case of disaster, one can rename old to
+ current and restore the CrawlDB to its previous state.
+ </description>
+</property>
+
+<property>
+ <name>db.update.purge.404</name>
+ <value>false</value>
+ <description>If true, updatedb will purge records with status DB_GONE
+ from the CrawlDB.
+ </description>
+</property>
+
+<property>
+ <name>db.update.purge.orphans</name>
+ <value>false</value>
+ <description>If true, updatedb will permanently delete URLs marked
+ as orphan from the CrawlDb. The plugin scoring-orphan needs to be
+ activated to get records marked as orphan. See the plugin's options
+ elsewhere in this document.
+ </description>
+</property>
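To use the adaptive schedule described above instead of the fixed default interval, override the schedule class in nutch-site.xml; the minimum interval below (one day) is illustrative:

<property>
  <name>db.fetch.schedule.class</name>
  <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
</property>
<property>
  <name>db.fetch.schedule.adaptive.min_interval</name>
  <!-- illustrative: never re-fetch more often than once a day -->
  <value>86400.0</value>
</property>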
+
+<property>
+ <name>crawldb.url.normalizers</name>
+ <value>false</value>
+ <description>
+ !Temporary, can be overwritten with the command line!
+ Normalize URLs when updating the crawldb
+ </description>
+</property>
+
+<property>
+ <name>crawldb.url.filters</name>
+ <value>false</value>
+ <description>
+ !Temporary, can be overwritten with the command line!
+ Filter URLs when updating the crawldb
+ </description>
+</property>
+
+<property>
+ <name>db.update.max.inlinks</name>
+ <value>10000</value>
+ <description>Maximum number of inlinks to take into account when updating
+ a URL score in the crawlDB. Only the best scoring inlinks are kept.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.internal.links</name>
+ <value>false</value>
+ <description>If true, outlinks leading from a page to internal hosts or domains
+ will be ignored. This is an effective way to limit the crawl to include
+ only initially injected hosts or domains, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.external.links</name>
+ <value>false</value>
+ <description>If true, outlinks leading from a page to external hosts or domains
+ will be ignored. This is an effective way to limit the crawl to include
+ only initially injected hosts or domains, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.also.redirects</name>
+ <value>true</value>
+ <description>If true, the fetcher checks redirects the same way as
+ links when ignoring internal or external links. Set to false to
+ follow redirects despite the values for db.ignore.external.links and
+ db.ignore.internal.links.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.external.links.mode</name>
+ <value>byHost</value>
+ <description>Alternative value is byDomain.</description>
+</property>
+
+<property>
+ <name>db.ignore.external.exemptions.file</name>
+ <value>db-ignore-external-exemptions.txt</value>
+ <description>
+ This file contains exemption rules used by the 'urlfilter-ignoreexempt' plugin.
+ </description>
+</property>
+
+<property>
+ <name>db.injector.overwrite</name>
+ <value>false</value>
+ <description>Whether existing records in the CrawlDB will be overwritten
+ by injected records.
+ </description>
+</property>
+
+<property>
+ <name>db.injector.update</name>
+ <value>false</value>
+ <description>If true, existing records in the CrawlDB will be updated with
+ injected records. Old metadata is preserved. The db.injector.overwrite
+ parameter has precedence.
+ </description>
+</property>
+
+<property>
+ <name>db.score.injected</name>
+ <value>1.0</value>
+ <description>The score of new pages added by the injector.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.external</name>
+ <value>1.0</value>
+ <description>The score factor for new pages added due to a link from
+ another host relative to the referencing page's score. Scoring plugins
+ may use this value to affect initial scores of external links.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.internal</name>
+ <value>1.0</value>
+ <description>The score factor for pages added due to a link from the
+ same host, relative to the referencing page's score. Scoring plugins
+ may use this value to affect initial scores of internal links.
+ </description>
+</property>
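Combined, the db.ignore.external.links properties above give a lightweight way to confine a crawl to the injected seed domains without writing URL filters:

<property>
  <name>db.ignore.external.links</name>
  <value>true</value>
</property>
<property>
  <name>db.ignore.external.links.mode</name>
  <value>byDomain</value>
</property>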
+
+<property>
+ <name>db.score.count.filtered</name>
+ <value>false</value>
+ <description>The score value passed to newly discovered pages is
+ calculated as a fraction of the original page score divided by the
+ number of outlinks. If this option is false, only the outlinks that passed
+ the URLFilters will count; if it is true, all outlinks will count.
+ </description>
+</property>
+
+<property>
+ <name>db.max.outlinks.per.page</name>
+ <value>100</value>
+ <description>The maximum number of outlinks that we'll process for a page.
+ If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
+ will be processed for a page; otherwise, all outlinks will be processed.
+ </description>
+</property>
+
+<property>
+ <name>db.max.outlink.length</name>
+ <value>4096</value>
+ <description>
+ The maximum length in characters accepted for outlinks before
+ applying URL normalizers and filters. If this value is
+ nonnegative (>=0), only URLs with a length in characters less than
+ or equal to db.max.outlink.length are accepted and then passed to
+ URL normalizers and filters. Doing the length check beforehand
+ prevents normalizers or filters from hanging on overlong URLs.
+ Note: this property is only used to check URLs found as outlinks
+ and redirects, but not for injected URLs.
+ </description>
+</property>
+
+<property>
+ <name>db.parsemeta.to.crawldb</name>
+ <value></value>
+ <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
+ Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
+ will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.retry.max</name>
+ <value>3</value>
+ <description>The maximum number of times a URL that has encountered
+ recoverable errors is generated for fetch.</description>
+</property>
+
+<property>
+ <name>db.signature.class</name>
+ <value>org.apache.nutch.crawl.MD5Signature</value>
+ <description>The default implementation of a page signature. Signatures
+ created with this implementation will be used for duplicate detection
+ and removal.</description>
+</property>
+
+<property>
+ <name>db.signature.text_profile.min_token_len</name>
+ <value>2</value>
+ <description>Minimum token length to be included in the signature.
+ </description>
+</property>
+
+<property>
+ <name>db.signature.text_profile.quant_rate</name>
+ <value>0.01</value>
+ <description>Profile frequencies will be rounded down to a multiple of
+ QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is the maximum token
+ frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
+ for longer texts tokens with frequency 1 will always be discarded.
+ </description>
+</property>
+
+<property>
+ <name>db.stats.score.quantiles</name>
+ <value>.01,.05,.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.9,.95,.99</value>
+ <description>
+ Quantiles of the distribution of CrawlDatum scores shown in the
+ CrawlDb statistics (command `readdb -stats'). Comma-separated
+ list of floating point numbers.
+ </description>
+</property>
+
+<!-- linkdb properties -->
+
+<property>
+ <name>linkdb.max.inlinks</name>
+ <value>10000</value>
+ <description>Maximum number of inlinks per URL to be kept in LinkDb.
+ If "invertlinks" finds more inlinks than this number, only the first
+ N inlinks will be stored, and the rest will be discarded.
+ </description>
+</property>
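The db.signature.text_profile.* properties above only take effect when the signature implementation is switched from MD5 over the raw bytes to the text-based profile signature shipped with Nutch (the class name below is the one used in Nutch 1.x):

<property>
  <name>db.signature.class</name>
  <!-- text-based signature, more robust for near-duplicate detection -->
  <value>org.apache.nutch.crawl.TextProfileSignature</value>
</property>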
+
+<property>
+ <name>linkdb.ignore.internal.links</name>
+ <value>true</value>
+ <description>If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.ignore.external.links</name>
+ <value>false</value>
+ <description>If true, when adding new links to a page, links from
+ a different host are ignored.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.max.anchor.length</name>
+ <value>100</value>
+ <description>
+ The maximum number of characters permitted for anchor texts stored
+ in LinkDb.
+ </description>
+</property>
+
+<!-- generate properties -->
+
+<property>
+ <name>generate.max.count</name>
+ <value>-1</value>
+ <description>The maximum number of URLs in a single
+ fetchlist, -1 if unlimited. The URLs are counted according
+ to the value of the parameter generate.count.mode.
+ </description>
+</property>
+
+<property>
+ <name>generate.count.mode</name>
+ <value>host</value>
+ <description>Determines how the URLs are counted for generate.max.count.
+ Default value is 'host' but can be 'domain'. Note that we do not count
+ per IP in the new version of the Generator.
+ </description>
+</property>
+
+<property>
+ <name>generate.update.crawldb</name>
+ <value>false</value>
+ <description>For highly-concurrent environments, where several
+ generate/fetch/update cycles may overlap, setting this to true ensures
+ that generate will create different fetchlists even without intervening
+ updatedb-s, at the cost of running an additional job to update the CrawlDB.
+ If false, running generate twice without an intervening updatedb will
+ generate identical fetchlists. See also crawl.gen.delay, which defines
+ how long items already generated are blocked.</description>
+</property>
+
+<property>
+ <name>generate.min.score</name>
+ <value>0</value>
+ <description>Select only entries with a score larger than
+ generate.min.score.</description>
+</property>
+
+<property>
+ <name>generate.min.interval</name>
+ <value>-1</value>
+ <description>Select only entries with a retry interval lower than
+ generate.min.interval. A value of -1 disables this check.</description>
+</property>
+
+<property>
+ <name>generate.hostdb</name>
+ <value></value>
+ <description>Path to the HostDB, required for the generate.max.count.expr
+ and generate.fetch.delay.expr properties.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.fetch.delay.expr</name>
+ <value></value>
+ <description>Controls the variable fetcher.server.delay via a Jexl expression and
+ HostDB information. It allows you to alter the fetch delay based on HostDB data.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.max.count.expr</name>
+ <value></value>
+ <description>Controls the variable generate.max.count via a Jexl expression and
+ HostDB information. It allows you to alter maxCount based on HostDB data.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.restrict.status</name>
+ <value></value>
+ <description>Select only entries of this status, see
+ https://issues.apache.org/jira/browse/NUTCH-1248</description>
+</property>
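For example, to cap every fetchlist at 100 URLs per domain (both values are illustrative), the two generate properties above work together:

<property>
  <name>generate.max.count</name>
  <value>100</value>
</property>
<property>
  <name>generate.count.mode</name>
  <value>domain</value>
</property>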
+
+<!-- urlpartitioner properties -->
+
+<property>
+ <name>partition.url.mode</name>
+ <value>byHost</value>
+ <description>Determines how to partition URLs. Default value is 'byHost',
+ also takes 'byDomain' or 'byIP'.
+ </description>
+</property>
+
+<property>
+ <name>crawl.gen.delay</name>
+ <value>604800000</value>
+ <description>
+ This value, expressed in milliseconds, defines how long we should keep the lock on records
+ in CrawlDb that were just selected for fetching. If these records are not updated
+ in the meantime, the lock is canceled, i.e. they become eligible for selection again.
+ The default value is 7 days (604800000 ms). If generate.update.crawldb is false,
+ the property crawl.gen.delay has no effect.
+ </description>
+</property>
+
+<!-- fetcher properties -->
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>5.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server. Note that this might get
+ overridden by a Crawl-Delay from a robots.txt and is used ONLY if
+ fetcher.threads.per.queue is set to 1.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.server.min.delay</name>
+ <value>0.0</value>
+ <description>The minimum number of seconds the fetcher will delay between
+ successive requests to the same server. This value is applicable ONLY
+ if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking
+ is turned off).</description>
+</property>
+
+<property>
+ <name>fetcher.max.crawl.delay</name>
+ <value>30</value>
+ <description>
+ If the Crawl-Delay in robots.txt is set to greater than this value (in
+ seconds) then the fetcher will skip this page, generating an error report.
+ If set to -1 the fetcher will never skip such pages and will wait the
+ amount of time retrieved from robots.txt Crawl-Delay, however long that
+ might be.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.min.crawl.delay</name>
+ <value>${fetcher.server.delay}</value>
+ <description>
+ Minimum Crawl-Delay (in seconds) accepted in robots.txt, even if the
+ robots.txt specifies a shorter delay. By default the minimum Crawl-Delay
+ is set to the value of `fetcher.server.delay`, which guarantees that
+ a value set in the robots.txt cannot make the crawler more aggressive
+ than the default configuration.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.threads.fetch</name>
+ <value>10</value>
+ <description>The number of FetcherThreads the fetcher should use.
+ This also determines the maximum number of requests that are
+ made at once (each FetcherThread handles one connection). The total
+ number of threads running in distributed mode will be the number of
+ fetcher threads * number of nodes, as the fetcher has one map task per node.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.threads.per.queue</name>
+ <value>1</value>
+ <description>This number is the maximum number of threads that
+ should be allowed to access a queue at one time. Setting it to
+ a value > 1 will cause the Crawl-Delay value from robots.txt to
+ be ignored and the value of fetcher.server.min.delay to be used
+ as a delay between successive requests to the same server instead
+ of fetcher.server.delay.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.queue.mode</name>
+ <value>byHost</value>
+ <description>Determines how to put URLs into queues. Default value
+ is 'byHost', also takes 'byDomain' or 'byIP'. Crawl delays are
+ implemented on the level of fetcher queues.
+ </description>
+</property>
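A typical politeness setup keeps host blocking enabled and merely increases the delay between requests to the same server; the delay below is illustrative:

<property>
  <name>fetcher.server.delay</name>
  <!-- illustrative: 10 seconds between requests to the same host -->
  <value>10.0</value>
</property>
<property>
  <name>fetcher.threads.per.queue</name>
  <value>1</value>
</property>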
+
+<property>
+ <name>fetcher.verbose</name>
+ <value>false</value>
+ <description>If true, fetcher will log more verbosely.</description>
+</property>
+
+<property>
+ <name>http.log.exceptions.suppress.stack</name>
+ <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value>
+ <description>Comma-separated list of exceptions not shown with full
+ stack trace in logs of the fetcher and HTTP protocol implementations.
+ The logs may shrink in size significantly, e.g., when for a large
+ unrestricted web crawl unknown hosts are logged shortly without full
+ stack trace. The full class name of the exception class (extending
+ Throwable) including the package path must be specified.</description>
+</property>
+
+<property>
+ <name>fetcher.parse</name>
+ <value>false</value>
+ <description>If true, fetcher will parse content. Default is false, which means
+ that a separate parsing step is required after fetching is finished.</description>
+</property>
+
+<property>
+ <name>fetcher.store.content</name>
+ <value>true</value>
+ <description>If true, fetcher will store content.</description>
+</property>
+
+<property>
+ <name>fetcher.signature</name>
+ <value>false</value>
+ <description>If true, fetcher will generate the signature for
+ successfully fetched documents even if the content is not parsed by
+ the fetcher (see property fetcher.parse). Default is false, which means
+ that the signature is calculated when parsing, either by the fetcher
+ or during the parsing step. Note that a non-parsing fetcher can
+ only generate signatures based on the binary content and not on the
+ textual content. An appropriate signature class should be chosen
+ (see property db.signature.class).
+ </description>
+</property>
+
+<property>
+ <name>fetcher.timelimit.mins</name>
+ <value>-1</value>
+ <description>This is the number of minutes allocated to the fetching.
+ Once this value is reached, any remaining entry from the input URL list is skipped
+ and all active queues are emptied. The default value of -1 deactivates the time limit.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.max.exceptions.per.queue</name>
+ <value>-1</value>
+ <description>The maximum number of protocol-level exceptions (e.g. timeouts) per
+ host (or IP) queue. Once this value is reached, any remaining entries from this
+ queue are purged, effectively stopping the fetching from this host/IP. The default
+ value of -1 deactivates this limit.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.throughput.threshold.pages</name>
+ <value>-1</value>
+ <description>The threshold of minimum pages per second. If the fetcher downloads fewer
+ pages per second than the configured threshold, the fetcher stops, preventing slow queues
+ from stalling the throughput. This threshold must be an integer. This can be useful when
+ fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
+ </description>
+</property>
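To bound a fetch cycle both in time and in per-host failures, the two limits above are often combined; the values are illustrative:

<property>
  <name>fetcher.timelimit.mins</name>
  <!-- illustrative: stop fetching after 3 hours -->
  <value>180</value>
</property>
<property>
  <name>fetcher.max.exceptions.per.queue</name>
  <!-- illustrative: give up on a host after 25 protocol errors -->
  <value>25</value>
</property>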
+
+<property>
+ <name>fetcher.throughput.threshold.retries</name>
+ <value>5</value>
+ <description>The number of times fetcher.throughput.threshold.pages is allowed to be exceeded.
+ This setting prevents accidental slowdowns from immediately killing the fetcher thread.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.throughput.threshold.check.after</name>
+ <value>5</value>
+ <description>The number of minutes after which the throughput check is enabled.</description>
+</property>
+
+<property>
+ <name>fetcher.threads.timeout.divisor</name>
+ <value>2</value>
+ <description>(EXPERT) The thread time-out divisor to use. By default threads have a time-out
+ value of mapreduce.task.timeout / 2. Increase this setting if the fetcher waits too
+ long before killing hung threads. Be careful, a too high setting (8 or more) will most likely kill the
+ fetcher threads prematurely.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.queue.depth.multiplier</name>
+ <value>50</value>
+ <description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
+ (see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
+ A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
+ is not optimal.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.depth</name>
+ <value>-1</value>
+ <description>(EXPERT) When fetcher.parse is true and this value is greater than 0, the fetcher will extract outlinks
+ and follow them until the desired depth is reached. A value of 1 means all generated pages are fetched and their first degree
+ outlinks are fetched and parsed too. Be careful, this feature is in itself agnostic of the state of the CrawlDB and does not
+ know about already fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle.
+ It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same
+ domain. When disabled (false) the feature is likely to follow duplicates even when depth=1.
+ A value of -1 or 0 disables this feature.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.num.links</name>
+ <value>4</value>
+ <description>(EXPERT) The number of outlinks to follow when fetcher.follow.outlinks.depth is enabled. Be careful, this can multiply
+ the total number of pages to fetch. This works with fetcher.follow.outlinks.depth.divisor; with default settings the number of
+ outlinks followed at depth 1 is 8, not 4.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.depth.divisor</name>
+ <value>2</value>
+ <description>(EXPERT) The divisor of fetcher.follow.outlinks.num.links per fetcher.follow.outlinks.depth. This decreases the number
+ of outlinks to follow with increasing depth. The formula used is: outlinks = floor(divisor / depth * num.links). This prevents
+ exponential growth of the fetch list.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.ignore.external</name>
+ <value>true</value>
+ <description>Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks
+ in the output but not follow them. If db.ignore.external.links is true this directive is ignored.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.bandwidth.target</name>
+ <value>-1</value>
+ <description>Target bandwidth in kilobits per second for each mapper instance. This is used to adjust the number of
+ fetching threads automatically (up to fetcher.maxNum.threads).
+ A value of -1 deactivates the functionality, in which case
+ the number of fetching threads is fixed (see fetcher.threads.fetch).</description>
+</property>
+
+<property>
+ <name>fetcher.maxNum.threads</name>
+ <value>25</value>
+ <description>Max number of fetch threads allowed when using fetcher.bandwidth.target. Defaults to fetcher.threads.fetch if unspecified or
+ set to a value lower than it.</description>
+</property>
+
+<property>
+ <name>fetcher.bandwidth.target.check.everyNSecs</name>
+ <value>30</value>
+ <description>(EXPERT) Value in seconds which determines how frequently we should reassess the optimal number of fetch threads when using
+ fetcher.bandwidth.target. Defaults to 30 and must be at least 1.</description>
+</property>
+
+<property>
+ <name>fetcher.store.robotstxt</name>
+ <value>false</value>
+ <description>If true (and fetcher.store.content is also true),
+ fetcher will store the robots.txt response content and status for
+ debugging or archival purposes. The robots.txt is added to the
+ content/ folder of the fetched segment.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.publisher</name>
+ <value>false</value>
+ <description>Set this value to true if you want to use an implementation of the Publisher/Subscriber model. Make sure to set the corresponding
+ Publisher implementation specific properties.</description>
+</property>
+
+<property>
+ <name>fetcher.filter.urls</name>
+ <value>false</value>
+ <description>Whether fetcher will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+ <name>fetcher.normalize.urls</name>
+ <value>false</value>
+ <description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
+<property>
+ <name>http.redirect.max</name>
+ <value>0</value>
+ <description>The maximum number of redirects the fetcher will follow when
+ trying to fetch a page. If set to negative or 0, fetcher won't immediately
+ follow redirected URLs, instead it will record them for later fetching.
+ </description>
+</property>
+
+<property>
+ <name>http.redirect.max.exceeded.skip</name>
+ <value>false</value>
+ <description>
+ Whether to skip the last URL in a redirect chain when redirects
+ are followed (http.redirect.max > 0) and the maximum number of redirects
+ in a chain is exceeded (redirect_count > http.redirect.max).
+ If not skipped the redirect target URLs are stored as `linked`
+ and fetched in one of the following cycles. See also NUTCH-2748.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.seconds</name>
+ <value>-1</value>
+ <description>
+ The maximum time in seconds fetcher will cache redirects for
+ deduplication. If the same redirect URL is seen again within
+ this time it is skipped. This helps to avoid pathological cases
+ where many or most of the URLs of a host are redirected to the
+ same URL, e.g. a page to log in, to accept cookies, or to indicate
+ an error. A value less than or equal to zero disables redirect deduplication.
+ Caveat: This may break setting cookies via recursive redirect chains.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.size</name>
+ <value>1000</value>
+ <description>
+ The maximum size of the cache to deduplicate redirects,
+ see `fetcher.redirect.dedupcache.seconds`.
+ </description>
+</property>
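By default redirects are recorded and fetched in a later cycle; to have the fetcher follow short redirect chains immediately, raise http.redirect.max, e.g.:

<property>
  <name>http.redirect.max</name>
  <!-- illustrative: follow up to 3 redirects per page -->
  <value>3</value>
</property>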
+
+
+<!-- SegmentReader -->
+<property>
+ <name>segment.reader.content.recode</name>
+ <value>false</value>
+ <description>
+ SegmentReader when dumping segments: If true, try to recode the content
+ of HTML documents from the original encoding to UTF-8. Note, this
+ property can be overwritten by SegmentReader command-line options.
+ </description>
+</property>
+
+
+
+<!-- any23 plugin properties -->
+
+<property>
+ <name>any23.extractors</name>
+ <value>html-microdata</value>
+ <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
+</property>
+
+<property>
+ <name>any23.content_types</name>
+ <value>text/html,application/xhtml+xml</value>
+ <description>Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description>
+</property>
+
+<!-- moreindexingfilter plugin properties -->
+
+<property>
+ <name>moreIndexingFilter.indexMimeTypeParts</name>
+ <value>true</value>
+ <description>Determines whether the index-more plugin will split the mime-type
+ into sub parts; this requires the type field to be multi valued. Set to true for backward
+ compatibility. False will not split the mime-type.
+ </description>
+</property>
+
+<property>
+ <name>moreIndexingFilter.mapMimeTypes</name>
+ <value>false</value>
+ <description>Determines whether MIME-type mapping is enabled. It takes a
+ plain text file with mapped MIME-types. With it the user can map both
+ application/xhtml+xml and text/html to the same target MIME-type so it
+ can be treated equally in an index. See conf/contenttype-mapping.txt.
+ </description>
+</property>
+
+<property>
+ <name>moreIndexingFilter.mapMimeTypes.field</name>
+ <value></value>
+ <description>It's used if moreIndexingFilter.mapMimeTypes is true. Indicates the field
+ where the mapped MIME-type must be written. If it's empty or unset, the content of the field "type"
+ will be replaced by the mapped MIME-type.
+ </description>
+</property>
+
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+ <name>anchorIndexingFilter.deduplicate</name>
+ <value>false</value>
+ <description>With this enabled the indexer will case-insensitively deduplicate anchors
+ before indexing. This prevents possibly hundreds or thousands of identical anchors for
+ a given page from being indexed, but will affect the search scoring (i.e. tf=1.0f).
+ </description>
+</property>
+
+<!-- indexingfilter plugin properties -->
+
+<property>
+ <name>indexingfilter.order</name>
+ <value></value>
+ <description>The order by which index filters are applied.
+ If empty, all available index filters (as dictated by the properties
+ plugin-includes and plugin-excludes above) are loaded and applied in system
+ defined order. If not empty, only named filters are loaded and applied
+ in the given order. For example, if this property has the value:
+ org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
+ then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+
+ Filter ordering might have an impact on the result if one filter depends on the output of
+ another filter.
+ </description>
+</property>
+
+<property>
+ <name>indexer.score.power</name>
+ <value>0.5</value>
+ <description>Determines the power of link analysis scores.
+ The boost of each page is set to <i>score<sup>scorePower</sup></i> where
+ <i>score</i> is its link analysis score and <i>scorePower</i> is the
+ value of this parameter. This is compiled into indexes, so, when
+ this is changed, pages must be re-indexed for it to take
+ effect.</description>
+</property>
+
+<property>
+ <name>indexer.max.title.length</name>
+ <value>100</value>
+ <description>The maximum number of characters of a title that are indexed. A value of -1 disables this check.
+ </description>
+</property>
+
+<property>
+ <name>indexer.max.content.length</name>
+ <value>-1</value>
+ <description>The maximum number of characters of the content that are indexed.
+ Content beyond the limit is truncated. A value of -1 disables this check.
+ </description>
+</property>
+
+<property>
+ <name>indexer.add.domain</name>
+ <value>false</value>
+ <description>Whether to add the domain field to a NutchDocument.</description>
+</property>
+
+<property>
+ <name>indexer.skip.notmodified</name>
+ <value>false</value>
+ <description>Whether the indexer will skip records with a db_notmodified status.
+ </description>
+</property>
+
+<property>
+ <name>indexer.delete.robots.noindex</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents marked by robots=noindex.
+ </description>
+</property>
+
+<property>
+ <name>indexer.delete.skipped.by.indexingfilter</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents that were skipped by indexing filters.
+ </description>
+</property>
+
+<property>
+ <name>indexer.indexwriters.file</name>
+ <value>index-writers.xml</value>
+ <description>The configuration file for index writers.</description>
+</property>
+
+<!-- Exchanges properties -->
+
+<property>
+ <name>exchanges.exchanges.file</name>
+ <value>exchanges.xml</value>
+ <description>The configuration file used by the Exchange component.</description>
+</property>
+
+<!-- URL normalizer properties -->
+
+<property>
+ <name>urlnormalizer.order</name>
+ <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value>
+ <description>Order in which normalizers will run. If any of these isn't
+ activated it will be silently skipped. If other normalizers not on the
+ list are activated, they will run in random order after the ones
+ specified here are run.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.regex.file</name>
+ <value>regex-normalize.xml</value>
+ <description>Name of the config file used by the RegexURLNormalizer class.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.loop.count</name>
+ <value>1</value>
+ <description>Optionally loop through normalizers several times, to make
+ sure that all transformations have been performed.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.basic.host.idn</name>
+ <value></value>
+ <description>Let urlnormalizer-basic
+ (org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer)
+ normalize Internationalized Domain Names (IDNs). Possible values
+ are: `toAscii` - convert the Unicode form to the ASCII (Punycode)
+ representation, `toUnicode` - convert ASCII (Punycode) to Unicode,
+ or if left empty no normalization of IDNs is performed.
+ </description>
+</property>
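For example, to normalize Internationalized Domain Names to their ASCII (Punycode) form as described above:

<property>
  <name>urlnormalizer.basic.host.idn</name>
  <value>toAscii</value>
</property>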
+
+<property>
+ <name>urlnormalizer.basic.host.trim-trailing-dot</name>
+ <value>false</value>
+ <description>urlnormalizer-basic: Trim a trailing dot in host names:
+ `https://example.org./` is normalized to `https://example.org/`.
+ </description>
+</property>
+
+<!-- mime properties -->
+
+<!--
+<property>
+ <name>mime.types.file</name>
+ <value>tika-mimetypes.xml</value>
+ <description>Name of file in CLASSPATH containing filename extension and
+ magic sequence to mime types mapping information. Overrides the default Tika config
+ if specified.
+ </description>
+</property>
+-->
+
+<property>
+ <name>mime.type.magic</name>
+ <value>true</value>
+ <description>Defines if the mime content type detector uses magic resolution.
+ </description>
+</property>
+
+<!-- plugin properties -->
+
+<property>
+ <name>plugin.folders</name>
+ <value>plugins</value>
+ <description>Directories where Nutch plugins are located. Each
+ element may be a relative or absolute path. If absolute, it is used
+ as is. If relative, it is searched for on the classpath.</description>
+</property>
+
+<property>
+ <name>plugin.auto-activation</name>
+ <value>true</value>
+ <description>Defines if plugins that are not activated according to
+ the plugin.includes and plugin.excludes properties must be automatically
+ activated if they are needed by some active plugins.
+ </description>
+</property>
+
+<property>
+ <name>plugin.includes</name>
+ <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+ <description>Regular expression naming plugin directory names to
+ include. Any plugin not matching this expression is excluded.
+ By default Nutch includes plugins to crawl HTML and various other
+ document formats via HTTP/HTTPS and to index the crawled content
+ into Solr. More plugins are available to support more indexing
+ backends, to fetch ftp:// and file:// URLs, for focused crawling,
+ and many other use cases.
+ </description>
+</property>
+
+<property>
+ <name>plugin.excludes</name>
+ <value></value>
+ <description>Regular expression naming plugin directory names to exclude.
+ </description>
+</property>
+
+<property>
+ <name>urlmeta.tags</name>
+ <value></value>
+ <description>
+ To be used in conjunction with features introduced in NUTCH-655, which allows
+ for custom metatags to be injected alongside your crawl URLs. Specifying those
+ custom tags here will allow for their propagation into a page's outlinks, as
+ well as allow for them to be included as part of an index.
+ Values should be comma-delimited ("tag1,tag2,tag3"). Do not pad the tags with
+ white-space at their boundaries if you are using anything earlier than Hadoop-0.21.
+ </description>
+</property>
+
+<!-- parser properties -->
+
+<property>
+ <name>parse.plugin.file</name>
+ <value>parse-plugins.xml</value>
+ <description>The name of the file that defines the associations between
+ content-types and parsers.</description>
+</property>
+
+<property>
+ <name>parser.character.encoding.default</name>
+ <value>windows-1252</value>
+ <description>The character encoding to fall back to when no other information
+ is available.</description>
+</property>
+
+<property>
+ <name>encodingdetector.charset.min.confidence</name>
+ <value>-1</value>
+ <description>An integer between 0-100 indicating the minimum confidence value
+ for charset auto-detection. Any negative value disables auto-detection.
+ </description>
+</property>
+
+<property>
+ <name>parser.caching.forbidden.policy</name>
+ <value>content</value>
+ <description>If a site (or a page) requests through its robot metatags
+ that it should not be shown as cached content, apply this policy. Currently
+ three keywords are recognized: "none" ignores any "noarchive" directives.
+ "content" doesn't show the content, but shows summaries (snippets).
+ "all" doesn't show either content or summaries.</description>
+</property>
+
+<property>
+ <name>parser.html.impl</name>
+ <value>neko</value>
+ <description>HTML Parser implementation. Currently the following keywords
+ are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+ </description>
+</property>
+
+<property>
+ <name>parser.html.form.use_action</name>
+ <value>false</value>
+ <description>If true, the HTML parser will collect URLs from form action
+ attributes. This may lead to undesirable behavior (submitting empty
+ forms during the next fetch cycle). If false, the form action attribute will
+ be ignored.</description>
+</property>
+
+<property>
+ <name>parser.html.outlinks.ignore_tags</name>
+ <value></value>
+ <description>Comma-separated list of HTML tags from which outlinks
+ shouldn't be extracted. Nutch takes links from: a, area, form, frame,
+ iframe, script, link, img. If you add any of those tags here, links
+ from it won't be taken. Default is the empty list. A reasonable value
+ for most people would probably be "img,script,link".</description>
+</property>
+
+<property>
+ <name>parser.html.outlinks.htmlnode_metadata_name</name>
+ <value></value>
+ <description>If not empty, the name of the source HTML node in which an
+ outlink was found will be set in the outlink's metadata under this name.</description>
+</property>
+
+<property>
+ <name>parser.html.line.separators</name>
+ <value>article,aside,blockquote,canvas,dd,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,li,main,nav,noscript,ol,output,p,pre,section,table,tfoot,ul,video</value>
+ <description>Comma-separated list of HTML tags. A newline will be added to the
+ parsed text after these tags.
+ The default list above contains the block-level HTML elements.
+ Tags must be in lower case.
+ To disable this feature, leave the list empty.</description>
+</property>
+
+<property>
+ <name>htmlparsefilter.order</name>
+ <value></value>
+ <description>The order by which HTMLParse filters are applied.
+ If empty, all available HTMLParse filters (as dictated by the properties
+ plugin-includes and plugin-excludes above) are loaded and applied in system
+ defined order. If not empty, only named filters are loaded and applied
+ in the given order.
+ HTMLParse filter ordering MAY have an impact
+ on the end result, as some filters could rely on the metadata generated by a previous filter.
+ </description>
+</property>
+
+<property>
+ <name>parsefilter.naivebayes.trainfile</name>
+ <value>naivebayes-train.txt</value>
+ <description>Set the name of the file to be used for Naive Bayes training. The format will be:
+Each line contains two tab-separated parts.
+There are two columns/parts:
+1. "1" or "0", "1" for relevant and "0" for irrelevant documents.
+2. Text (text that will be used for training)
+
+Each row will be considered a new "document" for the classifier.
+CAUTION: Set the parser.timeout to -1 or a value bigger than 30 when using this classifier.
+ </description>
+</property>
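Following the hint in parser.html.outlinks.ignore_tags above, a common override that stops outlink extraction from media and resource tags:

<property>
  <name>parser.html.outlinks.ignore_tags</name>
  <value>img,script,link</value>
</property>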
+
+<property>
+  <name>parsefilter.naivebayes.wordlist</name>
+  <value>naivebayes-wordlist.txt</value>
+  <description>Name of the file containing the list of important words to be
+  matched in the URL by the model filter. The format is one word per line.
+  </description>
+</property>
+
+<property>
+  <name>parser.timeout</name>
+  <value>30</value>
+  <description>Timeout in seconds for the parsing of a document. On timeout the
+  document is treated as a parse exception and the parser moves on to the
+  following documents. This parameter is applied to any Parser implementation.
+  Set to -1 to deactivate, bearing in mind that this could cause
+  the parsing to crash because of a very long or corrupted document.
+  </description>
+</property>
+
+<property>
+  <name>parse.filter.urls</name>
+  <value>true</value>
+  <description>Whether the parser will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+  <name>parse.normalize.urls</name>
+  <value>true</value>
+  <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
+<property>
+  <name>parser.skip.truncated</name>
+  <value>true</value>
+  <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+  property is activated because parsing truncated documents can sometimes consume
+  extremely high levels of CPU.
+  </description>
+</property>
+
+<property>
+  <name>parser.store.text</name>
+  <value>true</value>
+  <description>If true (default value), the parser will store the parse text (parse_text directory within the segment).</description>
+</property>
+
+<!--
+<property>
+  <name>tika.htmlmapper.classname</name>
+  <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+  <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence
+  the behavior of the HTMLParseFilters.
+  </description>
+</property>
+-->
+
+<property>
+  <name>tika.config.file</name>
+  <value>tika-config.xml</value>
+  <description>Nutch-specific Tika config file.</description>
+</property>
+
+<property>
+  <name>tika.uppercase.element.names</name>
+  <value>true</value>
+  <description>Determines whether TikaParser should uppercase the element name while generating the DOM
+  for a page, as done by Neko (used by default by parse-html) (see NUTCH-1592).
+  </description>
+</property>
+
+<property>
+  <name>tika.extractor</name>
+  <value>none</value>
+  <description>
+  Which text extraction algorithm to use. Valid values are: boilerpipe or none.
+  </description>
+</property>
+
+<property>
+  <name>tika.extractor.boilerpipe.algorithm</name>
+  <value>ArticleExtractor</value>
+  <description>
+  Which Boilerpipe algorithm to use. Valid values are: DefaultExtractor, ArticleExtractor
+  or CanolaExtractor.
+  </description>
+</property>
+
+<property>
+  <name>tika.extractor.boilerpipe.mime.types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>
+  Comma-separated list of MIME types accepted for Boilerpipe extraction;
+  documents of other MIME types are not passed to the Boilerpipe extractor.
+  </description>
+</property>
+
+<property>
+  <name>tika.parse.embedded</name>
+  <value>true</value>
+  <description>
+  Whether parse-tika shall parse embedded documents (even recursively).
+  </description>
+</property>
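+
+<!-- Illustrative only: enable Boilerpipe main-content extraction by
+     overriding the properties above in nutch-site.xml; ArticleExtractor is
+     one of the valid algorithms listed above.
+
+<property>
+  <name>tika.extractor</name>
+  <value>boilerpipe</value>
+</property>
+<property>
+  <name>tika.extractor.boilerpipe.algorithm</name>
+  <value>ArticleExtractor</value>
+</property>
+-->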
+
+<!-- urlfilter plugin properties -->
+
+<property>
+  <name>urlfilter.domain.file</name>
+  <value>domain-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing either top level domains or
+  hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.regex.file</name>
+  <value>regex-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing regular expressions
+  used by urlfilter-regex (RegexURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.automaton.file</name>
+  <value>automaton-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing regular expressions
+  used by urlfilter-automaton (AutomatonURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.prefix.file</name>
+  <value>prefix-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing URL prefixes
+  used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.suffix.file</name>
+  <value>suffix-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing URL suffixes
+  used by urlfilter-suffix (SuffixURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.fast.file</name>
+  <value>fast-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin.</description>
+</property>
+
+<property>
+  <name>urlfilter.order</name>
+  <value></value>
+  <description>The order by which URL filters are applied.
+  If empty, all available URL filters (as dictated by the properties
+  plugin.includes and plugin.excludes above) are loaded and applied in system
+  defined order. If not empty, only named filters are loaded and applied
+  in the given order. For example, if this property has the value:
+    org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
+  then RegexURLFilter is applied first, and PrefixURLFilter second.
+  Since all filters are AND'ed, filter ordering does not have an impact
+  on the end result, but it may have performance implications, depending
+  on the relative cost of the filters.
+  </description>
+</property>
+
+<!-- scoring filters properties -->
+
+<property>
+  <name>scoring.filter.order</name>
+  <value></value>
+  <description>The order in which scoring filters are applied. This
+  may be left empty (in which case all available scoring filters will
+  be applied in system defined order), or a space separated list of
+  implementation classes.
+  </description>
+</property>
+
+<!-- scoring-depth properties
+ Add 'scoring-depth' to the list of active plugins
+ in the parameter 'plugin.includes' in order to use it.
+ -->
+
+<property>
+  <name>scoring.depth.max</name>
+  <value>1000</value>
+  <description>Max depth value from seed allowed by default.
+  Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE"
+  as seed metadata. This plugin adds a "_depth_" metadatum to the pages
+  to track the distance from the seed from which it was found.
+  The depth is used to prioritise URLs in the generation step so that
+  shallower pages are fetched first.
+  </description>
+</property>
+
+<!-- scoring similarity properties
+ Add scoring-similarity to the list of active plugins
+ in the parameter 'plugin.includes' in order to use it.
+ For more detailed information on the working of this filter
+ visit https://cwiki.apache.org/confluence/display/NUTCH/SimilarityScoringFilter -->
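+
+<!-- Illustrative only: a plugin.includes value with scoring-similarity
+     enabled, as the note above describes (shown here replacing the default
+     scoring-opic).
+
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-similarity|urlnormalizer-(pass|regex|basic)</value>
+</property>
+-->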
+
+<property>
+  <name>scoring.similarity.model</name>
+  <value>cosine</value>
+  <description>The type of similarity metric to use, e.g. cosine (currently
+  the only available model).
+  Please make sure to set the model-specific properties for the scoring to
+  function properly. A description of these properties can be found on the wiki.
+  </description>
+</property>
+
+<property>
+  <name>scoring.similarity.ngrams</name>
+  <value>1,1</value>
+  <description>Specifies the min 'n' and max 'n' of the ngrams as comma-separated
+  values. If a single value 'n' is specified, it is used for both the min 'n'
+  and max 'n' of the ngrams.
+  </description>
+</property>
+
+<property>
+  <name>cosine.goldstandard.file</name>
+  <value>goldstandard.txt</value>
+  <description>Path to the gold standard file which contains all the relevant
+  text and terms pertaining to the domain.
+  </description>
+</property>
+
+<property>
+  <name>scoring.similarity.stopword.file</name>
+  <value>stopwords.txt</value>
+  <description>Name of the stopword text file. The user can specify a custom
+  list of stop words in a text file. Each stopword should be on its own line.
+  </description>
+</property>
+
+<!-- scoring filter orphan properties -->
+
+<property>
+  <name>scoring.orphan.mark.gone.after</name>
+  <value>2592000</value>
+  <description>Time in seconds after which orphaned
+  pages are marked as gone. Default is 30 days.
+  </description>
+</property>
+
+<property>
+  <name>scoring.orphan.mark.orphan.after</name>
+  <value>3456000</value>
+  <description>Time in seconds after which orphaned
+  pages are marked as orphan. Default is 40 days.
+  </description>
+</property>
+
+<!-- language-identifier plugin properties -->
+
+<property>
+  <name>lang.analyze.max.length</name>
+  <value>2048</value>
+  <description>The maximum number of bytes used to identify
+  the language (0 means full content analysis).
+  The larger this value, the more accurate the analysis, but the
+  slower it is.
+  </description>
+</property>
+
+<property>
+  <name>lang.extraction.policy</name>
+  <value>detect,identify</value>
+  <description>This determines when the plugin uses detection and
+  statistical identification mechanisms. The order in which
+  detect and identify are written determines the extraction
+  policy. The default case (detect,identify) means the plugin will
+  first try to extract language info from page headers and metadata;
+  if this is not successful it will try using Tika language
+  identification. Possible values are:
+    detect
+    identify
+    detect,identify
+    identify,detect
+  </description>
+</property>
+
+<property>
+  <name>lang.identification.only.certain</name>
+  <value>false</value>
+  <description>If set to true with lang.extraction.policy containing identify,
+  the language code returned by Tika will be assigned to the document ONLY
+  if it is deemed certain by Tika.
+  </description>
+</property>
+
+<property>
+  <name>lang.index.languages</name>
+  <value></value>
+  <description>If not empty, should be a comma-separated list of language codes.
+  Only documents with one of these language codes will be indexed.
+  "unknown" is a valid language code; it matches documents where language
+  detection failed.
+  </description>
+</property>
+
+<!-- index-jexl-filter plugin properties -->
+
+<property>
+  <name>index.jexl.filter</name>
+  <value></value>
+  <description>A JEXL expression. If it evaluates to false,
+  the document will not be indexed.
+  Available primitives in the JEXL context:
+  * status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title
+  Available objects in the JEXL context:
+  * httpStatus - contains majorCode, minorCode, message
+  * documentMeta, contentMeta, parseMeta - contain all the Metadata properties.
+    Each property value is always an array of Strings (so if you expect one value, use [0]).
+  * doc - contains all the NutchFields from the NutchDocument.
+    Each property value is always an array of Objects.
+  </description>
+</property>
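+
+<!-- Illustrative only: a JEXL expression that indexes only documents fetched
+     with HTTP status 200 and a title, using the primitives and objects
+     listed above.
+
+<property>
+  <name>index.jexl.filter</name>
+  <value>httpStatus.majorCode == 200 and title != null</value>
+</property>
+-->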
+
+<!-- index-static plugin properties -->
+
+<property>
+  <name>index.static</name>
+  <value></value>
+  <description>
+  Used by plugin index-static to add fields with static data at indexing time.
+  You can specify a comma-separated list of fieldname:fieldcontent per Nutch job.
+  Each fieldcontent can have multiple values separated by space, e.g.
+    field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ...
+  This can be useful when collections can't be created by URL patterns,
+  like in subcollection, but only on a per-job basis.
+  </description>
+</property>
+
+<property>
+  <name>index.static.fieldsep</name>
+  <value>,</value>
+  <description>
+  Used by plugin index-static to parse the property index.static. Default: comma.
+  This delimiter is used to separate individual field specifications in the property.
+  </description>
+</property>
+
+<property>
+  <name>index.static.keysep</name>
+  <value>:</value>
+  <description>
+  Used by plugin index-static to parse the property index.static. Default: colon.
+  This delimiter is used to separate the field name from the field value in the field specification.
+  </description>
+</property>
+
+<property>
+  <name>index.static.valuesep</name>
+  <value> </value>
+  <description>
+  Used by plugin index-static to parse the property index.static. Default: space.
+  This delimiter is used to separate multiple field values in the value setting of the field specification.
+  </description>
+</property>
+
+<!-- index-metadata plugin properties -->
+
+<property>
+  <name>index.parse.md</name>
+  <value>metatag.description,metatag.keywords</value>
+  <description>
+  Comma-separated list of keys to be taken from the parse metadata to generate fields.
+  Can be used e.g. for 'description' or 'keywords', provided that these values are
+  generated by a parser (see the parse-metatags plugin).
+  </description>
+</property>
+
+<property>
+  <name>index.content.md</name>
+  <value></value>
+  <description>
+  Comma-separated list of keys to be taken from the content metadata to generate fields.
+  </description>
+</property>
+
+<property>
+  <name>index.db.md</name>
+  <value></value>
+  <description>
+  Comma-separated list of keys to be taken from the crawldb metadata to generate fields.
+  Can be used to index values propagated from the seeds with the plugin urlmeta.
+  </description>
+</property>
+
+<property>
+  <name>index.metadata.separator</name>
+  <value></value>
+  <description>
+  Separator to use if you want to index multiple values for a given field. Leave empty to
+  treat each value as a single value.
+  </description>
+</property>
+
+<!-- index-geoip plugin properties -->
+<property>
+  <name>index.geoip.usage</name>
+  <value>insightsService</value>
+  <description>
+  A string representing the information source to be used for GeoIP information
+  association. Either enter 'cityDatabase', 'connectionTypeDatabase',
+  'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use one of
+  the database options, you should make the respective file (GeoIP2-City.mmdb,
+  GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb)
+  available on the classpath at runtime.
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.userid</name>
+  <value></value>
+  <description>
+  The userId associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.licensekey</name>
+  <value></value>
+  <description>
+  The license key associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+
+<property>
+  <name>index.replace.regexp</name>
+  <value/>
+  <description>Allows indexing-time regexp replace manipulation of metadata fields.
+  The format of the property is a list of regexp replacements, one line per field being
+  modified. Include index-replace in your plugin.includes.
+
+  Example:
+    hostmatch=.*somedomain.com
+    fldname1=/regexp/replacement/flags
+    fldname2=/regexp/replacement/flags
+
+  Field names would be one of those from
+  https://cwiki.apache.org/confluence/display/NUTCH/IndexStructure
+  See https://cwiki.apache.org/confluence/display/NUTCH/IndexReplace for further details.
+  </description>
+</property>
+
+<!-- parse-metatags plugin properties -->
+<property>
+  <name>metatags.names</name>
+  <value>description,keywords</value>
+  <description>Names of the metatags to extract, separated by ','.
+  Use '*' to extract all metatags. The names are prefixed with 'metatag.'
+  in the parse-metadata. For instance, to index description and keywords,
+  you need to activate the plugin index-metadata and set the value of the
+  parameter 'index.parse.md' to 'metatag.description,metatag.keywords'
+  (see the illustrative example further below).
+  </description>
+</property>
+
+<!-- Temporary Hadoop 0.17.x workaround. -->
+
+<property>
+  <name>hadoop.job.history.user.location</name>
+  <value>${hadoop.log.dir}/history/user</value>
+  <description>Hadoop 0.17.x comes with a default setting to create
+  user logs inside the output path of the job. This breaks some
+  Hadoop classes, which expect the output to contain only
+  part-XXXXX files. This setting changes the output to a
+  subdirectory of the regular log directory.
+  </description>
+</property>
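+
+<!-- Illustrative only (the parse-metatags / index-metadata combination
+     described under 'metatags.names' above): extract the description and
+     keywords metatags and index them. Both parse-metatags and index-metadata
+     must be listed in plugin.includes.
+
+<property>
+  <name>metatags.names</name>
+  <value>description,keywords</value>
+</property>
+<property>
+  <name>index.parse.md</name>
+  <value>metatag.description,metatag.keywords</value>
+</property>
+-->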
+
+<property>
+  <name>io.serializations</name>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+       org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+       org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
+</property>
+
+<!-- linkrank scoring properties -->
+
+<property>
+  <name>link.ignore.internal.host</name>
+  <value>true</value>
+  <description>Ignore outlinks to the same hostname.</description>
+</property>
+
+<property>
+  <name>link.ignore.internal.domain</name>
+  <value>true</value>
+  <description>Ignore outlinks to the same domain.</description>
+</property>
+
+<property>
+  <name>link.ignore.limit.page</name>
+  <value>true</value>
+  <description>Limit to only a single outlink to the same page.</description>
+</property>
+
+<property>
+  <name>link.ignore.limit.domain</name>
+  <value>true</value>
+  <description>Limit to only a single outlink to the same domain.</description>
+</property>
+
+<property>
+  <name>link.analyze.num.iterations</name>
+  <value>10</value>
+  <description>The number of LinkRank iterations to run.</description>
+</property>
+
+<property>
+  <name>link.analyze.initial.score</name>
+  <value>1.0f</value>
+  <description>The initial score.</description>
+</property>
+
+<property>
+  <name>link.analyze.damping.factor</name>
+  <value>0.85f</value>
+  <description>The damping factor (see the illustrative formula in the
+  comment further below).</description>
+</property>
+
+<property>
+  <name>link.delete.gone</name>
+  <value>false</value>
+  <description>Whether to delete gone pages from the web graph.</description>
+</property>
+
+<property>
+  <name>link.loops.depth</name>
+  <value>2</value>
+  <description>The depth for the loops algorithm.</description>
+</property>
+
+<property>
+  <name>link.score.updater.clear.score</name>
+  <value>0.0f</value>
+  <description>The default score for URLs that are not in the web graph.</description>
+</property>
+
+<property>
+  <name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name>
+  <value>false</value>
+  <description>Hadoop >= 0.21 generates _SUCCESS files in the output which can crash
+  the readers. This should not be an issue once Nutch is ported to the new MapReduce API,
+  but for now this parameter should prevent such cases.
+  </description>
+</property>
+
+<!-- subcollection properties -->
+
+<property>
+  <name>subcollection.default.fieldname</name>
+  <value>subcollection</value>
+  <description>
+  The default field name for the subcollections.
+  </description>
+</property>
+
+<property>
+  <name>subcollection.case.insensitive</name>
+  <value>false</value>
+  <description>
+  Whether the URL prefixes are to be matched case-insensitively.
+  </description>
+</property>
+
+<!-- Headings plugin properties -->
+
+<property>
+  <name>headings</name>
+  <value>h1,h2</value>
+  <description>Comma-separated list of headings to retrieve from the document.</description>
+</property>
+
+<property>
+  <name>headings.multivalued</name>
+  <value>false</value>
+  <description>Whether to support multivalued headings.</description>
+</property>
+
+<!-- mimetype-filter plugin properties -->
+
+<property>
+  <name>mimetype.filter.file</name>
+  <value>mimetype-filter.txt</value>
+  <description>
+  The configuration file for the mimetype-filter plugin. This file contains
+  the rules used to allow or deny the indexing of certain documents.
+  </description>
+</property>
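+
+<!-- Illustrative note on the linkrank properties above (an assumption based
+     on the standard PageRank-style formulation; consult the LinkRank code
+     for the exact details): each of the link.analyze.num.iterations
+     iterations updates a page's score from the scores of its inlinking
+     pages, dampened by the factor d = link.analyze.damping.factor:
+
+       score(p) = (1 - d) + d * sum over inlinks q of score(q) / outdegree(q)
+
+     starting from link.analyze.initial.score. -->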
+
+<!-- plugin properties that apply to lib-selenium, protocol-selenium,
+     protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit -->
+
+<property>
+  <name>page.load.delay</name>
+  <value>3</value>
+  <description>
+  The delay in seconds to use when loading a page with htmlunit or selenium.
+  </description>
+</property>
+
+<property>
+  <name>take.screenshot</name>
+  <value>false</value>
+  <description>
+  Boolean property determining whether the protocol-htmlunit
+  WebDriver should capture a screenshot of the URL. If set to
+  true, remember to define the 'screenshot.location'
+  property as this determines the location screenshots should be
+  persisted to on HDFS. If that property is not set, screenshots
+  are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>screenshot.location</name>
+  <value></value>
+  <description>
+  The location on disk where a URL screenshot should be saved
+  to if the 'take.screenshot' property is set to true.
+  By default this is null; in this case screenshots held in memory
+  are simply discarded.
+  </description>
+</property>
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
+  <name>htmlunit.enable.javascript</name>
+  <value>true</value>
+  <description>
+  A Boolean value representing whether JavaScript should
+  be enabled or disabled when using htmlunit. The default value is enabled.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.javascript.timeout</name>
+  <value>3500</value>
+  <description>
+  The timeout in milliseconds when loading JavaScript with lib-htmlunit. This
+  setting is used by protocol-htmlunit since it depends on
+  lib-htmlunit for fetching.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.enable.css</name>
+  <value>false</value>
+  <description>
+  A Boolean value representing whether CSS should
+  be enabled or disabled when using htmlunit. The default value is disabled.
+  </description>
+</property>
+
+<!-- protocol-selenium plugin properties -->
+
+<property>
+  <name>selenium.driver</name>
+  <value>firefox</value>
+  <description>
+  A String value representing the flavour of Selenium
+  WebDriver() to use. Currently the following options
+  exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+  If 'remote' is used, it is essential to also set correct properties for
+  'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host',
+  'selenium.hub.protocol', 'selenium.grid.driver', 'selenium.grid.binary'
+  and 'selenium.enable.headless'.
+  </description>
+</property>
+
+<property>
+  <name>selenium.hub.port</name>
+  <value>4444</value>
+  <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+  <name>selenium.hub.path</name>
+  <value>/wd/hub</value>
+  <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+  <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+  <name>selenium.hub.protocol</name>
+  <value>http</value>
+  <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<property>
+  <name>selenium.grid.driver</name>
+  <value>firefox</value>
+  <description>A String value representing the flavour of Selenium
+  WebDriver() used on the selenium grid. `selenium.driver` must be set
+  to `remote` first. Currently the following options
+  exist - 'firefox', 'chrome' and 'random'.</description>
+</property>
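+
+<!-- Illustrative only: fetching through a remote Selenium grid, as described
+     under 'selenium.driver' above; host and port follow the defaults of the
+     hub properties above and are assumptions for your setup.
+
+<property>
+  <name>selenium.driver</name>
+  <value>remote</value>
+</property>
+<property>
+  <name>selenium.grid.driver</name>
+  <value>firefox</value>
+</property>
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+</property>
+-->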
+
+<property>
+  <name>selenium.grid.binary</name>
+  <value></value>
+  <description>A String value representing the path to the browser binary
+  location for each node.
+  </description>
+</property>
+
+<!-- headless options for Firefox and Chrome -->
+<property>
+  <name>selenium.enable.headless</name>
+  <value>false</value>
+  <description>A Boolean value representing the headless option
+  for the Firefox and Chrome drivers.
+  </description>
+</property>
+
+<!-- selenium firefox configuration;
+     applies to protocol-selenium and protocol-interactiveselenium plugins -->
+<property>
+  <name>selenium.firefox.allowed.hosts</name>
+  <value>localhost</value>
+  <description>A String value representing the allowed hosts preference
+  according to the operating system hosts file (example: /etc/hosts on Unix).
+  Currently this option exists for 'firefox' only.</description>
+</property>
+
+<property>
+  <name>selenium.firefox.binary.timeout</name>
+  <value>45</value>
+  <description>A Long value representing the timeout value
+  for firefox to be available for command execution. The value is in seconds.
+  Currently this option exists for 'firefox' only.</description>
+</property>
+
+<property>
+  <name>selenium.firefox.enable.flash</name>
+  <value>false</value>
+  <description>A Boolean value representing whether flash should
+  be enabled or disabled. The default value is disabled.
+  Currently this option exists for 'firefox' only.</description>
+</property>
+
+<property>
+  <name>selenium.firefox.load.image</name>
+  <value>1</value>
+  <description>An Integer value representing the restriction on
+  loading images. The default value (1) is no restriction, i.e. load all images.
+  The options are:
+  1: Load all images, regardless of origin
+  2: Block all images
+  3: Prevent third-party images from loading
+  Currently this option exists for 'firefox' only.</description>
+</property>
+
+<property>
+  <name>selenium.firefox.load.stylesheet</name>
+  <value>1</value>
+  <description>An Integer value representing the restriction on
+  loading stylesheets. The default value (1) is no restriction, i.e. load
+  all stylesheets.
+  The options are:
+  1: Load all stylesheets
+  2: Block all stylesheets
+  Currently this option exists for 'firefox' only.</description>
+</property>
+
+<!-- selenium chrome configurations -->
+<property>
+  <name>webdriver.chrome.driver</name>
+  <value>/root/chromedriver</value>
+  <description>The path to the ChromeDriver binary.</description>
+</property>
+<!-- end of selenium chrome configurations -->
+
+<!-- protocol-interactiveselenium configuration -->
+<property>
+  <name>interactiveselenium.handlers</name>
+  <value>DefaultHandler</value>
+  <description>
+  A comma-separated list of Selenium handlers that should be run for a given
+  URL. The DefaultHandler provides the same functionality as protocol-selenium.
+  Custom handlers can be implemented in the plugin package and included here.
+  </description>
+</property>
+
+<property>
+  <name>store.http.request</name>
+  <value>false</value>
+  <description>
+  Store the raw request made by Nutch, required to use the CommonCrawlDataDumper
+  tool for the WARC format.
+  </description>
+</property>
+
+<property>
+  <name>store.http.headers</name>
+  <value>false</value>
+  <description>
+  Store the raw headers received by Nutch from the server, required to use the
+  CommonCrawlDataDumper tool for the WARC format.
+  </description>
+</property>
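+
+<!-- Illustrative only: to dump crawled data as WARC with the
+     CommonCrawlDataDumper tool, enable both properties above in
+     nutch-site.xml before fetching.
+
+<property>
+  <name>store.http.request</name>
+  <value>true</value>
+</property>
+<property>
+  <name>store.http.headers</name>
+  <value>true</value>
+</property>
+-->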
+
+<!-- index-links plugin -->
+
+<property>
+  <name>index.links.outlinks.host.ignore</name>
+  <value>false</value>
+  <description>
+  Ignore outlinks that point to the same host as the URL being indexed.
+  By default all outlinks are indexed. If db.ignore.internal.links is true (default
+  value), this setting does nothing since the internal links are already
+  ignored.
+  </description>
+</property>
+
+<property>
+  <name>index.links.inlinks.host.ignore</name>
+  <value>false</value>
+  <description>
+  Ignore inlinks coming from the same host as the URL being indexed. By default
+  all inlinks are indexed. If db.ignore.internal.links is true (default
+  value), this setting does nothing since the internal links are already
+  ignored.
+  </description>
+</property>
+
+<property>
+  <name>index.links.hosts.only</name>
+  <value>false</value>
+  <description>
+  This forces the index-links plugin to index only the host portion of the
+  inlinks or outlinks.
+  </description>
+</property>
+
+<!-- HostDB settings -->
+<property>
+  <name>hostdb.recheck.interval</name>
+  <value>86400000</value>
+  <description>
+  Interval between rechecks in milliseconds. Default is one day. The recheck
+  interval is multiplied by the number of DNS lookup failures for a given
+  host.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.purge.failed.hosts.threshold</name>
+  <value>3</value>
+  <description>
+  If hosts have more failed DNS lookups than this threshold, they are
+  removed from the HostDB. Hosts can, of course, return if they are still
+  present in the CrawlDB.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.num.resolvers.threads</name>
+  <value>25</value>
+  <description>
+  Number of resolver threads per reducer. Make sure your DNS resolver is
+  capable of handling this value multiplied by the number of reducers.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.check.failed</name>
+  <value>true</value>
+  <description>
+  True if hosts for which DNS lookup failed are eligible for recheck. If
+  false, hosts that failed DNS lookup at least once are not eligible
+  for another DNS lookup.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.check.new</name>
+  <value>true</value>
+  <description>
+  True if newly discovered hosts are eligible for a DNS lookup check. If false,
+  hosts that are just added to the HostDB are not eligible for DNS lookup.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.check.known</name>
+  <value>true</value>
+  <description>
+  True if already known hosts are eligible for a DNS lookup check. If false,
+  known hosts are not eligible for DNS lookup.
+  </description>
+</property>
+
+<property>
+  <name>hostdb.force.check</name>
+  <value>false</value>
+  <description>
[... 236 lines stripped ...]
