Hi,

I am trying to set up Nutch 2.1 and Solr 4.0 with a MySQL database, following the
instructions at this link: http://nlp.solutions.asia/?p=180.
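
For reference, the MySQL part of that page comes down to pointing Gora's SqlStore
at the database in conf/gora.properties, roughly like this (the database name,
user and password below are placeholders, not my real values):

# Gora SqlStore -> MySQL (placeholder values)
gora.sqlstore.jdbc.driver=com.mysql.jdbc.Driver
gora.sqlstore.jdbc.url=jdbc:mysql://localhost:3306/nutch
gora.sqlstore.jdbc.user=<mysql-user>
gora.sqlstore.jdbc.password=<mysql-password>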

I made some changes in conf/nutch-site.xml (for example, I set the fetcher threads to 50).

When I start the crawl (from ~/Desktop/apache-nutch-2.1/runtime/local, with the
command: bin/nutch crawl urls -depth 5 -topN 1), I see the message
"Skipping http://www.domainname.com/category/viewvideo/111; different batch
id (null)" for a lot of pages.
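
As far as I understand, the crawl command is roughly equivalent to running the
individual jobs below, so I plan to run them one at a time to see at which point
the batch id gets lost (the options are as I read them from the 2.1 usage
messages, so please correct me if any of them are wrong):

bin/nutch inject urls
bin/nutch generate -topN 1000   # a new batch id is assigned here
bin/nutch fetch -all            # or: bin/nutch fetch <batchId>
bin/nutch parse -all            # or: bin/nutch parse <batchId>
bin/nutch updatedb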

My nutch-site.xml file is attached (and also pasted at the end of this mail).

I am using Debian 6.0.5 (x64) in a virtual machine on Windows 7 (x64).

I have many records in the database with headers = null, status = 1 and text =
null, and the other fields are also null.
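
I am looking at these rows directly in MySQL with something like the command
below ("nutch" and "webpage" are the database and table names from that
tutorial; <mysql-user> is a placeholder):

mysql -u <mysql-user> -p nutch \
  -e "SELECT id, status, headers, text FROM webpage LIMIT 10;"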

In conf/regex-urlfilter.txt I have:

# accept anything else
+^http://([a-z0-9]*\.)*www.domain01.com
+^http://([a-z0-9]*\.)*domain02.com
+^http://([a-z0-9]*\.)*www.domain03.com.mk

In /root/Desktop/apache-nutch-2.1/runtime/local/urls/seed.txt I have:

http://www.domain01.com
http://domain02.com
http://www.domain03.com.mk



Best Regards,

Dragan Menoski
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>http.agent.name</name>
    <value>Dragan's Spider</value>
  </property>

  <property>
    <name>http.accept.language</name>
    <value>mk-mk,mk,en-us,en-gb,en;q=0.7,*;q=0.3</value>
    <description>Value of the "Accept-Language" request header field.
    This allows selecting a non-English language as the default one to retrieve.
    It is a useful setting for search engines built for a certain national group.
    </description>
  </property>

  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
    <description>The character encoding to fall back to when no other
    information is available.</description>
  </property>

  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.sql.store.SqlStore</value>
    <description>The Gora DataStore class for storing and retrieving data.
    Currently the following stores are available: ….
    </description>
  </property>

  <property>
    <name>fetcher.threads.fetch</name>
    <value>50</value>
    <description>The number of FetcherThreads the fetcher should use.
    This also determines the maximum number of requests that are made
    at once (each FetcherThread handles one connection).</description>
  </property>

  <property>
    <name>fetcher.threads.per.host</name>
    <value>1</value>
    <description>The maximum number of threads that should be allowed
    to access a host at one time.</description>
  </property>

  <property>
    <name>fetcher.threads.per.queue</name>
    <value>50</value>
    <description></description>
  </property>

  <property>
    <name>fetcher.max.crawl.delay</name>
    <value>5</value>
    <description>If the Crawl-Delay in robots.txt is greater than this
    value (in seconds), the fetcher will skip that page.</description>
  </property>

  <property>
    <name>fetcher.server.delay</name>
    <value>1.0</value>
    <description>The number of seconds the fetcher will delay between
    successive requests to the same server.</description>
  </property>

  <property>
    <name>fetcher.server.min.delay</name>
    <value>0.0</value>
    <description>The minimum number of seconds the fetcher will delay
    between successive requests to the same server, applicable when more
    than one thread per host/queue is allowed.</description>
  </property>

  <property>
    <name>fetcher.queue.depth.multiplier</name>
    <value>200</value>
    <description>The depth of the in-memory fetch queues, as a multiple
    of the number of fetcher threads.</description>
  </property>

  <property>
    <name>db.ignore.external.links</name>
    <value>true</value>
    <description>If true, outlinks leading from a page to external hosts
    will be ignored. This is an effective way to limit the crawl to include
    only initially injected hosts, without creating complex URLFilters.
    </description>
  </property>

  <property>
    <name>http.timeout</name>
    <value>5000</value>
    <description>The default network timeout, in milliseconds.</description>
  </property>

  <property>
    <name>http.max.delays</name>
    <value>1</value>
    <description>The number of times a thread will delay when trying to
    fetch a page before giving up on it for now.</description>
  </property>

  <property>
    <name>generate.max.count</name>
    <value>50</value>
    <description></description>
  </property>

  <property>
    <name>generate.count.mode</name>
    <value>host</value>
    <description></description>
  </property>

  <property>
    <name>db.signature.class</name>
    <value>org.apache.nutch.crawl.TextProfileSignature</value>
    <description></description>
  </property>

  <property>
    <name>db.signature.text_profile.min_token_len</name>
    <value>2</value>
    <description>Minimum token length to be included in the signature.
    </description>
  </property>

  <property>
    <name>db.signature.text_profile.quant_rate</name>
    <value>0.01</value>
    <description>Profile frequencies will be rounded down to a multiple of
    QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
    frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
    for longer texts tokens with frequency 1 will always be discarded.
    </description>
  </property>

  <property>
    <name>db.fetch.schedule.class</name>
    <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
    <description></description>
  </property>

  <property>
    <name>db.fetch.interval.default</name>
    <value>7200</value>
    <description>The default number of seconds between re-fetches of a page
    (set here to 7200 = 2 hours; the original value was 2592000 = 30 days).
    </description>
  </property>

</configuration>
