[
https://issues.apache.org/jira/browse/NUTCH-2491?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16309956#comment-16309956
]
ASF GitHub Bot commented on NUTCH-2491:
---------------------------------------
lewismc closed pull request #270: NUTCH-2491: Integrate sitemap processing and
HostDB into crawl script
URL: https://github.com/apache/nutch/pull/270
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/src/bin/crawl b/src/bin/crawl
index da169353a..c92e5b46f 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -15,7 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>]
<Crawl Dir> <Num Rounds>
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <SeedDir>] [-sm
<SitemapDir>] <CrawlDir> <NumRounds>
# -i|--index Indexes crawl results into a configured indexer
# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new
segment when no URLs
# are scheduled for fetching. Suffix can be: s for second,
@@ -23,8 +23,9 @@
# specified second is used by default.
# -D A Java property to pass to Nutch calls
# -s Path to seeds file(s)
-# Crawl Dir Directory where the crawl/link/segments dirs are saved
-# Num Rounds The number of rounds to run this crawl for
+# -sm Path to sitemap URL file(s)
+# CrawlDir Directory where the crawl/link/segments dirs are saved
+# NumRounds The number of rounds to run this crawl for
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK
INVERSION AND
@@ -73,6 +74,10 @@ do
SEEDDIR="${2}"
shift 2
;;
+ -sm)
+ SITEMAPDIR="${2}"
+ shift 2
+ ;;
-w|--wait)
WAIT="${2}"
shift 2
@@ -84,16 +89,17 @@ do
done
if [[ $# != 2 ]]; then
- echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed
Dir>] <Crawl Dir> <Num Rounds>"
+ echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s
<SeedDir>] [-sm <SitemapDir>] <CrawlDir> <NumRounds>"
echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
echo -e "\t-D\t\tA Java property to pass to Nutch calls"
echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new
segment when no URLs"
echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
echo -e "\t\t\tspecified second is used by default."
- echo -e "\t-s Seed Dir\tPath to seeds file(s)"
- echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are
saved"
- echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+ echo -e "\t-s SeedDir\tPath to seeds file(s)"
+ echo -e "\t-sm SitemapDir\tPath to sitemap URL file(s)"
+ echo -e "\tCrawlDir\tDirectory where the crawl/host/link/segments dirs are
saved"
+ echo -e "\tNumRounds\tThe number of rounds to run this crawl for"
exit 1
fi
@@ -168,19 +174,23 @@ function __bin_nutch {
}
# initial injection
-if [[ ! -z $SEEDDIR ]]
-then
+if [[ ! -z $SEEDDIR ]]; then
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
+# sitemap processing based on sitemap definition file(s)
+if [[ ! -z $SITEMAPDIR ]]; then
+ echo "Processing sitemaps defined in $SITEMAPDIR"
+ __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR"
-threads $numThreads
+fi
+
# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
- if [ -e ".STOP" ]
- then
- echo "STOP file found - escaping loop"
- break
+ if [ -e ".STOP" ]; then
+ echo "STOP file found - escaping loop"
+ break
fi
if [ $LIMIT -ne -1 ]; then
@@ -193,6 +203,18 @@ do
echo `date` ": Iteration $a"
fi
+ # create / update HostDB
+ if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+ echo "Updating HostDB"
+ __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb
"$CRAWL_PATH"/hostdb
+ fi
+
+ # sitemap processing based on HostDB
+ if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+ echo "Processing sitemaps based on hosts in HostDB"
+ __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
-threads $numThreads
+ fi
+
echo "Generating a new segment"
generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
echo "$bin/nutch generate ${generate_args[@]}"
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Integrate sitemap processing and HostDB into crawl script
> ---------------------------------------------------------
>
> Key: NUTCH-2491
> URL: https://issues.apache.org/jira/browse/NUTCH-2491
> Project: Nutch
> Issue Type: Improvement
> Reporter: Moreno Feltscher
> Assignee: Moreno Feltscher
> Priority: Minor
> Fix For: 1.15
>
>
> Add three new steps to the crawl bash script:
> 1. Generate HostDB from CrawlDB
> 2. Inject URLs from sitemap URLs found in hosts from HostDB
> 3. If given, inject sitemap URLs specified in a configuration file / in
> configuration files
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)