Please remove the k-means step from the script. Otherwise it will launch
clustering as soon as the Reuters extraction and vectorization are done.


On Mon, May 10, 2010 at 8:25 PM, <jeast...@apache.org> wrote:

> Author: jeastman
> Date: Mon May 10 14:55:19 2010
> New Revision: 942766
>
> URL: http://svn.apache.org/viewvc?rev=942766&view=rev
> Log:
> Replacing obsolete build-reuters.sh with one that seems to be working
>
> Modified:
>    lucene/mahout/trunk/examples/bin/build-reuters.sh
>
> Modified: lucene/mahout/trunk/examples/bin/build-reuters.sh
> URL:
> http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/bin/build-reuters.sh?rev=942766&r1=942765&r2=942766&view=diff
>
> ==============================================================================
> --- lucene/mahout/trunk/examples/bin/build-reuters.sh (original)
> +++ lucene/mahout/trunk/examples/bin/build-reuters.sh Mon May 10 14:55:19
> 2010
> @@ -20,8 +20,9 @@
>  #
>  # To run:  change into the mahout/examples directory (the parent of the
> one containing this file) and type:
>  #  bin/build-reuters.sh
> -#
> -#
> +#!/bin/sh
> +
> +cd examples/bin/
>  mkdir -p work
>  if [ ! -e work/reuters-out ]; then
>   if [ ! -e work/reuters-sgm ]; then
> @@ -33,28 +34,10 @@ if [ ! -e work/reuters-out ]; then
>     echo "Extracting..."
>     cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
>   fi
> -  echo "Converting to plain text."
> -  mvn -e -q exec:java
>  -Dexec.mainClass="org.apache.lucene.benchmark.utils.ExtractReuters"
> -Dexec.args="work/reuters-sgm work/reuters-out" || exit
> -fi
> -# Create index
> -if [ ! -e work/index ]; then
> -  echo "Creating index";
> -  mvn -e exec:java -Dexec.classpathScope="test"
> -Dexec.mainClass="org.apache.lucene.benchmark.byTask.Benchmark"
> -Dexec.args="bin/lda.algorithm" || ( rm -rf work/index && exit )
> -fi
> -if [ ! -e work/vectors ]; then
> -  echo "Creating vectors from index"
> -  cd ../core
> -  mvn -q install -DskipTests=true
> -  cd ../utils/
> -  mvn -q compile
> -  mvn -e exec:java
> -Dexec.mainClass="org.apache.mahout.utils.vectors.lucene.Driver" \
> -    -Dexec.args="--dir ../examples/work/index/ --field body --dictOut
> ../examples/work/dict.txt \
> -    --output ../examples/work/vectors --minDF 100 --maxDFPercent 97" ||
> exit
> -  cd ../core/
>  fi
> -echo "Running LDA"
> -rm -rf ../examples/work/lda
> -MAVEN_OPTS="-Xmx2G -ea" mvn -e exec:java
> -Dexec.mainClass=org.apache.mahout.clustering.lda.LDADriver -Dexec.args="-i
> ../examples/work/vectors -o ../examples/work/lda/\
> -  -k 20 -v 10000 --maxIter 40"
> -echo "Writing top words for each topic to to examples/work/topics/"
> -mvn -q exec:java
> -Dexec.mainClass="org.apache.mahout.clustering.lda.LDAPrintTopics"
> -Dexec.args="-i `ls -1dtr ../examples/work/lda/state-* | tail -1` -d
> ../examples/work/dict.txt -o ../examples/work/topics/ -w 100"
> +
> +cd ../..
> +./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters
> ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
> +./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o
> ./examples/bin/work/reuters-out-seqdir -c UTF-8
> +./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
> ./examples/bin/work/reuters-out-seqdir-sparse
> +./bin/mahout kmeans -i
> ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c
> ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w
>
>
>

Reply via email to