The problem continues; I have the following script.

@echo off

rem *** Keep every variable change local to this script, and turn on delayed
rem *** expansion so that values assigned inside for loops can be read back
rem *** with the exclamation-mark syntax instead of percent signs.
setlocal EnableExtensions EnableDelayedExpansion

rem *** Crawl settings
rem *** Fdepth  - upper bound of the generate/fetch/parse/update loop
rem *** threads - defined here; no command visible in this script reads it
set Fdepth=300
set threads=10

rem *** topN is pasted verbatim onto the GeneratorJob command line.
rem *** Leave it empty to pass no option at all, or enable a limit with:
rem ***     set "topN=-topN 15"
rem *** It used to be set to two double quotes, which hands the generator a
rem *** literal empty argument rather than nothing.
rem *** NOTE(review): presumably GeneratorJob rejects an empty argument - confirm.
set "topN="

set urlDir=.\urls

rem *** Require JAVA_HOME
if not defined JAVA_HOME goto error


rem *** Set up the basic parameters; values already present in the
rem *** environment take precedence over these defaults
if not defined NUTCH_HOME     set "NUTCH_HOME=%CD%\..\.."
if not defined JAVA           set "JAVA=%JAVA_HOME%\bin\java.exe"
if not defined JAVA_HEAP_MAX  set "JAVA_HEAP_MAX=-Xmx1000m"
if not defined NUTCH_LOG_DIR  set "NUTCH_LOG_DIR=%NUTCH_HOME%\logs"
if not defined NUTCH_LOG_FILE set "NUTCH_LOG_FILE=hadoop.log"
rem *** The quotes are part of the value so each -D option stays one argument
set NUTCH_LOG_OPTS="-Dhadoop.log.dir=%NUTCH_LOG_DIR%" "-Dhadoop.log.file=%NUTCH_LOG_FILE%"
set "CLASSPATH=%NUTCH_HOME%;%NUTCH_HOME%\conf;%JAVA_HOME%\lib\tools.jar"

rem *** Add Nutch job file(s) to the class path.
rem *** The delayed form of CLASSPATH is required in every loop below: the
rem *** percent form is expanded once, before the loop starts, so with several
rem *** matching files only the last one would survive.
rem *** "delims=" keeps file names that contain spaces in one piece.
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\nutch-*.job" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\%%G"

rem *** Add Nutch .jar file(s) to the class path
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\lib\*.jar" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\lib\%%G"

rem *** Add Nutch .jar file(s) from jetty to the class path
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\lib\jetty-ext\*.jar" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\lib\jetty-ext\%%G"

rem *** Revamp the path
rem *** NOTE(review): this replaces the inherited PATH instead of extending it - confirm that is intended.
set PATH=/bin;%CD%\..\..\bin

echo "**************************************************************"
echo "--------------------- NUTCH vODAFONE --------------------------"
echo "**************************************************************"

rem *** Total number of steps, used only in the progress messages
set steps=2

echo "**************************************************************"
echo "--- Inject first urls---"
echo "**************************************************************"
echo "----- Inject (Step 1 of %steps%) -----"
rem *** The whole java command must stay on one line; cmd has no implicit
rem *** continuation, so a wrapped tail would be run as a separate command.
"%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.InjectorJob %urlDir%
rem *** Stop here when the inject job reports failure, rather than starting
rem *** the crawl loop after a failed seed injection.
if not "%errorlevel%"=="0" (
    echo "runbot: Stopping. Inject failed."
    exit /b 1
)

echo "**************************************************************"
echo "----- Generate, Fetch, Parse, Update (Step 2 of %steps%) -----"
echo "**************************************************************"

                rem *** Run up to Fdepth rounds of generate, fetch, parse and updatedb.
                rem *** The loop body is a single parenthesised block, so cmd expands
                rem *** every percent-style variable in it once, before the first
                rem *** round. Values that change per round - the exit code and the
                rem *** batch id - are therefore read with delayed expansion.
                rem *** NOTE(review): the -batchId option form is kept as written; verify
                rem *** it against the usage text of FetcherJob and ParserJob for the
                rem *** Nutch version in use.
                for /l %%d in (1, 1, %Fdepth%) do (

                    echo "**************************************************************"
                    echo "--- Beginning GENERATE at depth %%d ---"
                    echo "**************************************************************"

                    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.GeneratorJob %topN%
                    rem Capture the generator exit code at once, before the prompt below can alter it
                    set "rc=!errorlevel!"
                    if not "!rc!"=="0" (
                        echo "runbot: Stopping at depth %%d. No more URLs to fetch."
                        rem exit /b ends only this script; a bare EXIT would close the whole console
                        exit /b !rc!
                    )

                    rem The operator types in the batch id reported by the generate step
                    echo "batch-id"
                    set "batchid="
                    set /p "batchid=Enter ID: "
                    if not defined batchid (
                        echo "runbot: Stopping at depth %%d. No batch id was entered."
                        exit /b 1
                    )
                    echo !batchid!
                    echo   !batchid! batch id***********

                    echo "**************************************************************"
                    echo "--- Beginning FETCH at depth %%d ---"
                    echo "**************************************************************"
                    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.fetcher.FetcherJob -batchId !batchid!

                    rem A failed fetch is only reported; the round carries on with parse
                    if not "!errorlevel!"=="0" (
                        echo "runbot: fetch  at depth %%d failed."
                        rem echo "runbot: Deleting segment $segment."
                    )

                    echo "**************************************************************"
                    echo "--- Beginning PARSE at depth %%d ---"
                    echo "**************************************************************"

                    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.parse.ParserJob  -batchId !batchid!
                    set "rc=!errorlevel!"
                    if not "!rc!"=="0" (
                        echo "runbot: Stopping at depth %%d. error in parsejob."
                        exit /b !rc!
                    )

                    echo "**************************************************************"
                    echo "--- Beginning UPDATEDB at depth %%d ---"
                    echo "**************************************************************"
                    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.DbUpdaterJob
                    set "rc=!errorlevel!"
                    if not "!rc!"=="0" (
                        echo "runbot: Stopping at depth %%d. error in updater."
                        exit /b !rc!
                    )
                )


                        rem *** Closing banner. Each echo and its text must share one line;
                        rem *** a bare echo prints the echo state and the orphaned text on
                        rem *** the following line is then run as a command.
                        echo "**************************************************************"
                        echo "**************************************************************"
                        echo "**************************************************************"

                        echo "FIM"
                        echo ""


rem *** A successful run ends here: jump past the error handler, otherwise
rem *** execution falls through into it and reports a JAVA_HOME error.
goto done

:error
echo "ERROR: You must specify the path to your Java installation in the JAVA_HOME environment variable"
rem *** color with identical foreground and background changes nothing on
rem *** screen but sets ERRORLEVEL to 1, so the script ends with a failure code.
color 00

:done
rem *** Restore environment variables
echo "FIM"
endlocal

hugo.ma wrote
> 
> For example, I put the URL nabble.com in a seed file.
> Nutch then fetches and parses that URL; from the parse I get nabble.com/user
> and nabble.com/admin.
> Then, in the next fetch job, all three URLs are fetched and parsed:
> nabble.com
> nabble.com/user
> nabble.com/admin
> 
> And this process repeats until the end of the depth.(The urls are
> fictitious)
> 
> I left Nutch running on Tuesday at around 18:00, and today I checked my
> SQL Server database: the last record was from Wednesday at 10:40. It is
> still running over all the URLs fetched, around 3400 pages.
> 
> I didn't check Nutch yesterday because it was a holiday.
> 




--
View this message in context: 
http://lucene.472066.n3.nabble.com/Crawl-command-help-tp4001595p4002123.html
Sent from the Nutch - User mailing list archive at Nabble.com.

Reply via email to