svn commit: r378011 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ clustering-carrot2/lib/ lib-log4j/ lib-log4j/lib/ parse-pdf/ parse-pdf/lib/ parse-rss/ parse-rss/lib/

2006-02-15 Thread jerome
Author: jerome
Date: Wed Feb 15 06:24:56 2006
New Revision: 378011

URL: http://svn.apache.org/viewcvs?rev=378011&view=rev
Log:
Add a log4j library plugin (lib-log4j)

Added:
lucene/nutch/trunk/src/plugin/lib-log4j/
lucene/nutch/trunk/src/plugin/lib-log4j/build.xml   (with props)
lucene/nutch/trunk/src/plugin/lib-log4j/lib/
lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar   (with props)
lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml   (with props)
Removed:
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j-1.2.11.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j.LICENSE
lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-1.2.9.jar
lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-LICENSE.txt
lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
Modified:
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378011&r1=378010&r2=378011&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Wed Feb 15 06:24:56 2006
@@ -13,6 +13,7 @@
  ant dir=languageidentifier target=deploy/
  ant dir=lib-http target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
+ ant dir=lib-log4j target=deploy/
  ant dir=lib-lucene-analyzers target=deploy/
  ant dir=lib-parsems target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
@@ -78,6 +79,7 @@
 ant dir=languageidentifier target=clean/
 ant dir=lib-http target=clean/
 ant dir=lib-jakarta-poi target=clean/
+ant dir=lib-log4j target=clean/
 ant dir=lib-lucene-analyzers target=clean/
 ant dir=lib-parsems target=clean/
 ant dir=nutch-extensionpoints target=clean/

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378011&r1=378010&r2=378011&view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Wed Feb 15 
06:24:56 2006
@@ -4,4 +4,10 @@
 
   import file=../build-plugin.xml/
 
+  path id=plugin.deps
+fileset dir=../lib-log4j/lib
+  include name=*.jar /
+/fileset
+  /path
+
 /project

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378011&r1=378010&r2=378011&view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Wed Feb 15 
06:24:56 2006
@@ -29,6 +29,7 @@
 
requires
   import plugin=nutch-extensionpoints/
+  import plugin=lib-log4j/
/requires
 
extension id=org.apache.nutch.clustering.carrot2

Added: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/build.xml?rev=378011&view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml Wed Feb 15 06:24:56 2006
@@ -0,0 +1,17 @@
+?xml version=1.0?
+
+project name=lib-log4j default=jar
+
+  import file=../build-plugin.xml/
+
+  !--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! --
+  target name=compile depends=init
+echo message=Compiling plugin: ${name}/
+  /target
+
+  target name=jar depends=compile/
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar?rev=378011&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml
URL: 

svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 09:56:54 2006
New Revision: 378044

URL: http://svn.apache.org/viewcvs?rev=378044&view=rev
Log:
Upgrade to latest version of Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044&r1=378043&r2=378044&view=diff
==
Binary files - no diff available.




svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 14:45:31 2006
New Revision: 378107

URL: http://svn.apache.org/viewcvs?rev=378107&view=rev
Log:
Fix Fetcher to disable speculative execution, to keep it polite.  Also upgrade 
to latest hadoop jar that supports this feature.  Note that Hadoop's 
environment specification has changed, with all environment variables settable 
from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in 
one's home directory.

Added:
lucene/nutch/trunk/conf/hadoop-env.sh.template
lucene/nutch/trunk/conf/slaves.template
Modified:
lucene/nutch/trunk/conf/   (props changed)
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Propchange: lucene/nutch/trunk/conf/
--
--- svn:ignore (original)
+++ svn:ignore Wed Feb 15 14:45:31 2006
@@ -1,5 +1,4 @@
-nutch-site.xml
-regex-normalize.xml
-crawl-urlfilter.txt
-regex-urlfilter.txt
-mapred-default.xml
+*.xml
+*.txt
+*.sh
+slaves

Added: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto
==
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (added)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1,25 @@
+# Set Hadoop-specific environment variables here.
+
+# The java implementation to use.
+# export JAVA_HOME=/usr/bin/java
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HADOOP_HEAPSIZE=2000
+
+# Extra Java runtime options.  Empty by default.
+# export HADOOP_OPTS=-server
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
+# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves
+
+# host:path where hadoop code should be rsync'd from.  Unset by default.
+# export HADOOP_MASTER=master:/home/$USER/src/hadoop
+
+# The directory where pid files are stored. /tmp by default.
+# export HADOOP_PID_DIR=/var/hadoop/pids
+
+# A string representing this instance of hadoop. $USER by default.
+# export HADOOP_IDENT_STRING=$USER

Added: lucene/nutch/trunk/conf/slaves.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107&view=auto
==
--- lucene/nutch/trunk/conf/slaves.template (added)
+++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1 @@
+localhost

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107&r1=378106&r2=378107&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107&r1=378106&r2=378107&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 
15 14:45:31 2006
@@ -348,6 +348,9 @@
 job.set(SEGMENT_NAME_KEY, segment.getName());
 job.setBoolean(fetcher.parse, parsing);
 
+// for politeness, don't permit parallel execution of a single task
+job.setBoolean(mapred.speculative.execution, false);
+
 job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
 job.setInputFormat(InputFormat.class);
 job.setInputKeyClass(UTF8.class);