MAHOUT-1775 FileNotFoundException caused by aborting the process of downloading Wikipedia dataset, closes apache/mahout#162
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d53f0a5d
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d53f0a5d
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d53f0a5d

Branch: refs/heads/flink-binding
Commit: d53f0a5d78000045bb12e90e3a6808cc2c369450
Parents: e943b0a
Author: smarthi <[email protected]>
Authored: Sun Oct 25 00:29:47 2015 -0400
Committer: smarthi <[email protected]>
Committed: Sun Oct 25 09:57:46 2015 -0400

----------------------------------------------------------------------
 examples/bin/classify-wikipedia.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/mahout/blob/d53f0a5d/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
index 470a81c..68487dc 100755
--- a/examples/bin/classify-wikipedia.sh
+++ b/examples/bin/classify-wikipedia.sh
@@ -63,6 +63,8 @@ if [ "x$alg" != "xclean" ]; then
  mkdir -p ${WORK_DIR}
  if [ ! -e ${WORK_DIR}/wikixml ]; then
  mkdir -p ${WORK_DIR}/wikixml
+ fi
+ if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
  echo "Downloading wikipedia XML dump"
  ########################################################
  # Datasets: uncomment and run "clean" to change dataset
@@ -74,10 +76,11 @@ if [ "x$alg" != "xclean" ]; then
  ######### full wikipedia dump: 10G zipped
  #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
  ########################################################
-
- echo "Extracting..."
+ fi
+ if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
+ echo "Extracting..."

- cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+ cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
  fi

  echo $START_PATH
@@ -186,4 +189,4 @@ elif [ "x$alg" == "xclean" ]; then
  rm -rf $WORK_DIR
  $DFSRM $WORK_DIR
  fi
-# Remove the work directory
\ No newline at end of file
+# Remove the work directory
