[ 
https://issues.apache.org/jira/browse/NUTCH-2491?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16309956#comment-16309956
 ] 

ASF GitHub Bot commented on NUTCH-2491:
---------------------------------------

lewismc closed pull request #270: NUTCH-2491: Integrate sitemap processing and 
HostDB into crawl script
URL: https://github.com/apache/nutch/pull/270
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/bin/crawl b/src/bin/crawl
index da169353a..c92e5b46f 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>] 
<Crawl Dir> <Num Rounds>
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <SeedDir>] [-sm 
<SitemapDir>] <CrawlDir> <NumRounds>
 #    -i|--index      Indexes crawl results into a configured indexer
 #    -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new 
segment when no URLs
 #                    are scheduled for fetching. Suffix can be: s for second,
@@ -23,8 +23,9 @@
 #                    specified second is used by default.
 #    -D              A Java property to pass to Nutch calls
 #    -s              Path to seeds file(s)
-#    Crawl Dir       Directory where the crawl/link/segments dirs are saved
-#    Num Rounds      The number of rounds to run this crawl for
+#    -sm             Path to sitemap URL file(s)
+#    CrawlDir        Directory where the crawl/link/segments dirs are saved
+#    NumRounds       The number of rounds to run this crawl for
 #
 #
 # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK 
INVERSION AND
@@ -73,6 +74,10 @@ do
             SEEDDIR="${2}"
             shift 2
             ;;
+        -sm)
+            SITEMAPDIR="${2}"
+            shift 2
+            ;;
         -w|--wait)
             WAIT="${2}"
             shift 2
@@ -84,16 +89,17 @@ do
 done
 
 if [[ $# != 2 ]]; then
-    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed 
Dir>] <Crawl Dir> <Num Rounds>"
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s 
<SeedDir>] [-sm <SitemapDir>] <CrawlDir> <NumRounds>"
     echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
     echo -e "\t-D\t\tA Java property to pass to Nutch calls"
     echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new 
segment when no URLs"
     echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
     echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
     echo -e "\t\t\tspecified second is used by default."
-    echo -e "\t-s Seed Dir\tPath to seeds file(s)"
-    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are 
saved"
-    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+    echo -e "\t-s SeedDir\tPath to seeds file(s)"
+    echo -e "\t-sm SitemapDir\tPath to sitemap URL file(s)"
+    echo -e "\tCrawlDir\tDirectory where the crawl/host/link/segments dirs are 
saved"
+    echo -e "\tNumRounds\tThe number of rounds to run this crawl for"
     exit 1
 fi
 
@@ -168,19 +174,23 @@ function __bin_nutch {
 }
 
 # initial injection
-if [[ !  -z  $SEEDDIR  ]]
-then 
+if [[ !  -z  $SEEDDIR  ]]; then
   echo "Injecting seed URLs"
   __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
 fi
 
+# sitemap processing based on sitemap definition file(s)
+if [[ ! -z $SITEMAPDIR ]]; then
+  echo "Processing sitemaps defined in $SITEMAPDIR"
+  __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" 
-threads $numThreads
+fi
+
 # main loop : rounds of generate - fetch - parse - update
 for ((a=1; ; a++))
 do
-  if [ -e ".STOP" ]
-  then
-   echo "STOP file found - escaping loop"
-   break
+  if [ -e ".STOP" ]; then
+    echo "STOP file found - escaping loop"
+    break
   fi
 
   if [ $LIMIT -ne -1 ]; then
@@ -193,6 +203,18 @@ do
     echo `date` ": Iteration $a"
   fi
 
+  # create / update HostDB
+  if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+    echo "Updating HostDB"
+    __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb 
"$CRAWL_PATH"/hostdb
+  fi
+
+  # sitemap processing based on HostDB
+  if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+    echo "Processing sitemaps based on hosts in HostDB"
+    __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb 
-threads $numThreads
+  fi
+
   echo "Generating a new segment"
   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments 
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
   echo "$bin/nutch generate ${generate_args[@]}"


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Integrate sitemap processing and HostDB into crawl script
> ---------------------------------------------------------
>
>                 Key: NUTCH-2491
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2491
>             Project: Nutch
>          Issue Type: Improvement
>            Reporter: Moreno Feltscher
>            Assignee: Moreno Feltscher
>            Priority: Minor
>             Fix For: 1.15
>
>
> Add three new steps to the crawl bash script:
> 1. Generate HostDB from CrawlDB
> 2. Inject URLs from sitemap URLs found in hosts from HostDB
> 3. If given, inject sitemap URLs specified in a configuration file / in 
> configuration files



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to