svn commit: r378011 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ clustering-carrot2/lib/ lib-log4j/ lib-log4j/lib/ parse-pdf/ parse-pdf/lib/ parse-rss/ parse-rss/lib/
Author: jerome Date: Wed Feb 15 06:24:56 2006 New Revision: 378011 URL: http://svn.apache.org/viewcvs?rev=378011view=rev Log: Add a log4j library plugin (lib-log4j) Added: lucene/nutch/trunk/src/plugin/lib-log4j/ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (with props) lucene/nutch/trunk/src/plugin/lib-log4j/lib/ lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar (with props) lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml (with props) Removed: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j-1.2.11.jar lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j.LICENSE lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-1.2.9.jar lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-LICENSE.txt lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar Modified: lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Wed Feb 15 06:24:56 2006 @@ -13,6 +13,7 @@ ant dir=languageidentifier target=deploy/ ant dir=lib-http target=deploy/ ant dir=lib-jakarta-poi target=deploy/ + ant dir=lib-log4j target=deploy/ ant dir=lib-lucene-analyzers target=deploy/ ant dir=lib-parsems target=deploy/ ant dir=nutch-extensionpoints target=deploy/ @@ -78,6 +79,7 @@ ant dir=languageidentifier target=clean/ ant dir=lib-http target=clean/ ant dir=lib-jakarta-poi target=clean/ +ant dir=lib-log4j target=clean/ ant dir=lib-lucene-analyzers target=clean/ ant dir=lib-parsems target=clean/ ant 
dir=nutch-extensionpoints target=clean/ Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Wed Feb 15 06:24:56 2006 @@ -4,4 +4,10 @@ import file=../build-plugin.xml/ + path id=plugin.deps +fileset dir=../lib-log4j/lib + include name=*.jar / +/fileset + /path + /project Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Wed Feb 15 06:24:56 2006 @@ -29,6 +29,7 @@ requires import plugin=nutch-extensionpoints/ + import plugin=lib-log4j/ /requires extension id=org.apache.nutch.clustering.carrot2 Added: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/build.xml?rev=378011view=auto == --- lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml Wed Feb 15 06:24:56 2006 @@ -0,0 +1,17 @@ +?xml version=1.0? + +project name=lib-log4j default=jar + + import file=../build-plugin.xml/ + + !-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! 
-- + target name=compile depends=init +echo message=Compiling plugin: ${name}/ + /target + + target name=jar depends=compile/ + +/project Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar?rev=378011view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml URL:
svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Wed Feb 15 09:56:54 2006 New Revision: 378044 URL: http://svn.apache.org/viewcvs?rev=378044&view=rev Log: Upgrade to latest version of Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044&r1=378043&r2=378044&view=diff == Binary files - no diff available.
svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java
Author: cutting Date: Wed Feb 15 14:45:31 2006 New Revision: 378107 URL: http://svn.apache.org/viewcvs?rev=378107&view=rev Log: Fix Fetcher to disable speculative execution, to keep it polite. Also upgrade to latest hadoop jar that supports this feature. Note that Hadoop's environment specification has changed, with all environment variables settable from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in one's home directory. Added: lucene/nutch/trunk/conf/hadoop-env.sh.template lucene/nutch/trunk/conf/slaves.template Modified: lucene/nutch/trunk/conf/ (props changed) lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Propchange: lucene/nutch/trunk/conf/ -- --- svn:ignore (original) +++ svn:ignore Wed Feb 15 14:45:31 2006 @@ -1,5 +1,4 @@ -nutch-site.xml -regex-normalize.xml -crawl-urlfilter.txt -regex-urlfilter.txt -mapred-default.xml +*.xml +*.txt +*.sh +slaves Added: lucene/nutch/trunk/conf/hadoop-env.sh.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto == --- lucene/nutch/trunk/conf/hadoop-env.sh.template (added) +++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1,25 @@ +# Set Hadoop-specific environment variables here. + +# The java implementation to use. +# export JAVA_HOME=/usr/bin/java + +# The maximum amount of heap to use, in MB. Default is 1000. +# export HADOOP_HEAPSIZE=2000 + +# Extra Java runtime options. Empty by default. +# export HADOOP_OPTS=-server + +# Where log files are stored. $HADOOP_HOME/logs by default. +# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + +# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. +# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + +# host:path where hadoop code should be rsync'd from. Unset by default. +# export HADOOP_MASTER=master:/home/$USER/src/hadoop + +# The directory where pid files are stored. /tmp by default. 
+# export HADOOP_PID_DIR=/var/hadoop/pids + +# A string representing this instance of hadoop. $USER by default. +# export HADOOP_IDENT_STRING=$USER Added: lucene/nutch/trunk/conf/slaves.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107&view=auto == --- lucene/nutch/trunk/conf/slaves.template (added) +++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1 @@ +localhost Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107&r1=378106&r2=378107&view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107&r1=378106&r2=378107&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 15 14:45:31 2006 @@ -348,6 +348,9 @@ job.set(SEGMENT_NAME_KEY, segment.getName()); job.setBoolean("fetcher.parse", parsing); +// for politeness, don't permit parallel execution of a single task +job.setBoolean("mapred.speculative.execution", false); + job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); job.setInputKeyClass(UTF8.class);