Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change 
notification.

The following page has been changed by SumanKumar:
http://wiki.apache.org/nutch/MonitoringNutchCrawls

------------------------------------------------------------------------------
  
  This will give you minute-by-minute stats on how many pages Nutch tried to fetch and how many failed with errors (e.g. 404, server unreachable).
  
+ 
+ 
+ === An expanded version of the above script ===
+ Before running it, change the three exported paths (LOGFILE, SAVEFILE, ERRORFILE) at the top of the script to match your setup.
+ 
+ {{{
+ #!/bin/bash
+ ####################################################################################
+ ######################## Author: Chalavadi Suman Kumar #############################
+ ######################## Email: [EMAIL PROTECTED] ##################################
+ ####################################################################################
+ 
+ # Usage: sh monitor.sh existing/running logfile outputlogfile errorlogfile
+ # 'existing' - logfile is an already existing (finished) log
+ # 'running'  - logfile is a running log (this is the default option)
+ 
+ # Edit these log paths to match your installation...
+ export LOGFILE=/home/suman/nutch-0.8/logs/hadoop.log
+ export SAVEFILE=/home/suman/NutchMonitor/short_hadoop.log
+ export ERRORFILE=/home/suman/NutchMonitor/logs/hadoop_error.log
+ 
+ # Read an existing log in one pass ('cat') or follow a running log ('tail -f').
+ # By default it assumes 'running'.
+ case "$1" in
+ 'existing') COMMAND="cat" ;;
+ 'running')  COMMAND="tail -f" ;;
+ *)          COMMAND="tail -f" ;;
+ esac
+ 
+ # Override LOGFILE, SAVEFILE and ERRORFILE if they are given on the command line.
+ # If '-' is given, the default value is kept.
+ if [[ "$2" != "" && "$2" != "-" ]]; then LOGFILE=$2; fi
+ if [[ "$3" != "" && "$3" != "-" ]]; then SAVEFILE=$3; fi
+ if [[ "$4" != "" && "$4" != "-" ]]; then ERRORFILE=$4; fi
+ 
+ # Initialize the counters...
+ fetchcount=0
+ fetchfailcount=0
+ indexcount=0
+ mflcount=0
+ lasttime=0
+ newtime=0
+ totalfetchcount=0
+ totalindexedcount=0
+ totalfetchfail=0
+ totalmflcount=0
+ 
+ # Read the file line by line as it grows, so every log entry passes through the loop...
+ $COMMAND $LOGFILE | while read some; do
+ 
+ # Log entries start with a digit (the date); exception-trace lines start with
+ # e.g. 'java' or 'at' and are diverted to the error log instead.
+ case "$some" in
+ [0-9]*)
+ # Get the current minute in 'newtime' and the full timestamp in 'totaltime'.
+ newtime=`echo $some | perl -ne '@words=split(/\s+/); @temp=split(/:/,$words[1]); print $temp[1]'`
+ totaltime=`echo $some | perl -ne '@words=split(/\s+/); print "$words[0] $words[1]"'`
+ ;;
+ *) echo $some >>$ERRORFILE ;;
+ esac
+ 
+ # Pattern-match the important operations and print the counts whenever the minute changes...
+ case "$some" in
+ *indexer.Indexer\ -\ Indexer:\ starting*)
+                 indexcount=0; mflcount=0;
+                 ;;
+ 
+ *indexer.Indexer\ -\ Indexer:\ done*)
+                 if [[ $indexcount -ne 0 || $mflcount -ne 0 ]]; then
+                 echo "$totaltime Indexed:$indexcount MaxFieldLengthError:$mflcount TotalPagesIndexed=$totalindexedcount TotalMFLErrors=$totalmflcount" >>$SAVEFILE
+                 lasttime="$newtime";
+                 fi
+                 indexcount=0; mflcount=0;
+                 ;;
+ 
+ *fetcher.Fetcher\ -\ Fetcher:\ starting*)
+                 fetchcount=0; fetchfailcount=0;
+                 ;;
+ 
+ *fetcher.Fetcher\ -\ Fetcher:\ done*)
+                 if [[ $fetchcount -ne 0 || $fetchfailcount -ne 0 ]]; then
+                 echo "$totaltime FetchTried:$fetchcount FetchFailed:$fetchfailcount TotalPagesFetched=$totalfetchcount TotalFetchesFailed=$totalfetchfail" >>$SAVEFILE
+                 lasttime="$newtime";
+                 fi
+                 fetchcount=0; fetchfailcount=0;
+                 ;;
+ 
+ *fetching*)
+                 let fetchcount=$fetchcount+1;
+                 let totalfetchcount=$totalfetchcount+1;
+                 if [ "$newtime" -ne "$lasttime" ]; then
+                 echo "$totaltime FetchTried:$fetchcount FetchFailed:$fetchfailcount TotalPagesFetched=$totalfetchcount TotalFetchesFailed=$totalfetchfail" >>$SAVEFILE
+                 fetchcount=0; fetchfailcount=0;
+                 lasttime="$newtime";
+                 fi
+                 ;;
+ 
+ *failed*)
+                 let fetchfailcount=$fetchfailcount+1;
+                 let totalfetchfail=$totalfetchfail+1;
+                 if [ "$newtime" -ne "$lasttime" ]; then
+                 echo "$totaltime FetchTried:$fetchcount FetchFailed:$fetchfailcount TotalPagesFetched=$totalfetchcount TotalFetchesFailed=$totalfetchfail" >>$SAVEFILE
+                 fetchcount=0; fetchfailcount=0;
+                 lasttime="$newtime";
+                 fi
+                 ;;
+ 
+ # Ignore 'Indexing Filter' plugin messages so they are not counted as indexed pages.
+ *Indexing\ Filter*) ;;
+ *IndexingFilter*) ;;
+ 
+ *Indexing*)
+                 let indexcount=$indexcount+1;
+                 let totalindexedcount=$totalindexedcount+1;
+                 if [ "$newtime" -ne "$lasttime" ]; then
+                 echo "$totaltime Indexed:$indexcount MaxFieldLengthError:$mflcount TotalPagesIndexed=$totalindexedcount TotalMFLErrors=$totalmflcount" >>$SAVEFILE
+                 indexcount=0; mflcount=0;
+                 lasttime="$newtime";
+                 fi
+                 ;;
+ 
+ *maxFieldLength*)
+                 let mflcount=$mflcount+1;
+                 let totalmflcount=$totalmflcount+1;
+                 if [ "$newtime" -ne "$lasttime" ]; then
+                 echo "$totaltime Indexed:$indexcount MaxFieldLengthError:$mflcount TotalPagesIndexed=$totalindexedcount TotalMFLErrors=$totalmflcount" >>$SAVEFILE
+                 indexcount=0; mflcount=0;
+                 lasttime="$newtime";
+                 fi
+                 ;;
+ esac
+ 
+ # Record the important stages of the crawl (inject, generate, fetch, update, invert, index, dedup, merge).
+ echo $some | perl -ne 'if(/fetcher.Fetcher\ -\ Fetcher:|indexer.Indexer\ -\ Indexer:|crawl.CrawlDb\ -\ CrawlDb update:|crawl.Generator\ -\ Generator:|crawl.Injector\ -\ Injector:|crawl.LinkDb\ -\ LinkDb:|indexer.DeleteDuplicates\ -\ Dedup:|indexer.IndexMerger|crawl.Crawl\ -\ crawl/g){print}' >>$SAVEFILE
+ 
+ # Copy all warnings and errors to the error log.
+ echo $some | perl -ne 'if(/ERROR|WARN|FATAL/g){print}' >>$ERRORFILE
+ 
+ done
+ }}}
+ 
+ === Running this script ===
+  1. Change the exported paths (LOGFILE, SAVEFILE, ERRORFILE) in the script to match your Nutch installation.
+  2. Run the script, as shown in the examples below.
+  3. Monitor your crawl with: {{{tail -f short_hadoop.log}}}
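+ 
+ For example (these invocations follow the usage comment at the top of the script; the paths are placeholders, so substitute your own):
+ 
+ {{{
+ # follow a live crawl, using the default paths exported in the script
+ sh monitor.sh running
+ 
+ # post-process a finished crawl, overriding all three paths
+ sh monitor.sh existing /path/to/hadoop.log /path/to/short_hadoop.log /path/to/hadoop_error.log
+ 
+ # keep the default LOGFILE ('-') but redirect the summary and error logs
+ sh monitor.sh running - /tmp/short_hadoop.log /tmp/hadoop_error.log
+ }}}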
+ 
+ This will give you minute-by-minute stats on how many pages Nutch tried to fetch, how many failed with errors, and how many pages were indexed in the last minute.
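+ 
+ The summary lines in short_hadoop.log follow the format of the echo statements above; an illustrative (made-up) excerpt, with timestamps copied from the first two fields of the hadoop.log entries, might look like:
+ 
+ {{{
+ 2007-01-15 10:23:01,123 FetchTried:85 FetchFailed:3 TotalPagesFetched=4210 TotalFetchesFailed=97
+ 2007-01-15 10:24:01,456 FetchTried:91 FetchFailed:1 TotalPagesFetched=4301 TotalFetchesFailed=98
+ 2007-01-15 10:41:02,789 Indexed:512 MaxFieldLengthError:2 TotalPagesIndexed=512 TotalMFLErrors=2
+ }}}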
+ 
