This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 0e28af6 fixed hdfs file checks in crawl script
0e28af6 is described below
commit 0e28af65d8594e92d5818fe1c5a83b8f3c491d28
Author: Semyon Semyonov <[email protected]>
AuthorDate: Tue Mar 6 11:19:04 2018 +0100
fixed hdfs file checks in crawl script
---
src/bin/crawl | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 7a32be2..dc32367 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -219,8 +219,19 @@ function __bin_nutch {
fi
}
+#check if directory exists locally or on hdfs
+function __directory_exists {
+ if [[ "$mode" == local && -d "$1" ]]; then
+ return 0
+ elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
+ return 0
+ else
+ return 1
+ fi
+}
+
function __update_hostdb {
- if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+ if __directory_exists "$CRAWL_PATH"/crawldb; then
echo "Updating HostDB"
__bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
fi
@@ -261,7 +272,7 @@ do
[[ $a -eq 1 ]] && __update_hostdb
# sitemap processing based on HostDB
- if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+ if __directory_exists "$CRAWL_PATH"/hostdb; then
echo "Processing sitemaps based on hosts in HostDB"
__bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
fi
--
To stop receiving notification emails like this one, please contact
[email protected].