The problem continues. I have the following script:
@echo off
rem *** Do not allow this script to permanently modify environment variables,
rem *** and enable delayed expansion so that for loops can accumulate into a
rem *** variable using exclamation marks instead of percent signs.
setlocal ENABLEEXTENSIONS ENABLEDELAYEDEXPANSION

rem *** Upper bound on the number of generate, fetch, parse, update rounds.
set Fdepth=300
rem *** NOTE(review): threads is not referenced anywhere in the visible script.
set threads=10
rem *** To limit the generator, replace the line below with: set topN=-topN 15
rem *** It is left empty here so that no extra argument reaches GeneratorJob
rem *** (the previous value was a pair of double quotes, which was passed on
rem *** the command line as a literal empty argument).
set "topN="
rem *** Directory holding the seed URL files handed to InjectorJob.
set urlDir=.\urls
rem *** Require JAVA_HOME
if not defined JAVA_HOME goto error

rem *** Setup the basic parameters; each value is only defaulted when the
rem *** caller has not already provided it in the environment.
if not defined NUTCH_HOME set "NUTCH_HOME=%CD%\..\.."
if not defined JAVA set "JAVA=%JAVA_HOME%\bin\java.exe"
if not defined JAVA_HEAP_MAX set "JAVA_HEAP_MAX=-Xmx1000m"
if not defined NUTCH_LOG_DIR set "NUTCH_LOG_DIR=%NUTCH_HOME%\logs"
if not defined NUTCH_LOG_FILE set "NUTCH_LOG_FILE=hadoop.log"

rem *** Both -D options must be on the same line as the set command; when the
rem *** second one sits on its own line it is dropped from the variable and
rem *** executed as a command instead.
set NUTCH_LOG_OPTS="-Dhadoop.log.dir=%NUTCH_LOG_DIR%" "-Dhadoop.log.file=%NUTCH_LOG_FILE%"

set "CLASSPATH=%NUTCH_HOME%;%NUTCH_HOME%\conf;%JAVA_HOME%\lib\tools.jar"
rem *** Each loop below appends one directory listing to CLASSPATH.
rem *** The delims option keeps file names containing spaces intact, and the
rem *** delayed form of CLASSPATH is used in every loop so that each match is
rem *** appended instead of only the last one surviving.
rem *** Listing errors for missing directories or no matches are discarded.

rem *** Add Nutch job file(s) to the class path
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\nutch-*.job" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\%%G"

rem *** Add Nutch .jar file(s) to the class path
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\lib\*.jar" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\lib\%%G"

rem *** Add Nutch .jar file(s) from jetty to the class path
for /f "delims=" %%G in ('dir /b "%NUTCH_HOME%\lib\jetty-ext\*.jar" 2^>nul') do set "CLASSPATH=!CLASSPATH!;%NUTCH_HOME%\lib\jetty-ext\%%G"
rem *** Revamp the path
rem *** NOTE(review): this replaces PATH entirely for the rest of the script,
rem *** so only cmd built-ins and fully qualified programs can be started
rem *** afterwards. Confirm that this is intended.
set PATH=/bin;%CD%\..\..\bin

rem *** Every echo keeps its text on the same line: a bare echo only reports
rem *** the echo state and the text on the next line would be run as a command.
echo "**************************************************************"
echo "--------------------- NUTCH vODAFONE --------------------------"
echo "**************************************************************"

rem *** Number of top-level steps, used only in the progress banners.
set steps=2

echo "**************************************************************"
echo "--- Inject first urls---"
echo "**************************************************************"
echo "----- Inject (Step 1 of %steps%) -----"
rem *** Seed the crawl database with the URLs found under urlDir.
"%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.InjectorJob %urlDir%

echo "**************************************************************"
echo "----- Generate, Fetch, Parse, Update (Step 2 of %steps%) -----"
echo "**************************************************************"
rem *** Generate, fetch, parse and update, repeated up to Fdepth times.
rem *** The whole parenthesised body is parsed once before the loop starts, so
rem *** anything that changes per round, namely errorlevel and batchid, is read
rem *** with the delayed exclamation-mark form. The percent form would freeze
rem *** the values they had before the loop began.
for /l %%d in (1, 1, %Fdepth%) do (
    echo "**************************************************************"
    echo "--- Beginning GENERATE at depth %%d ---"
    echo "**************************************************************"
    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.GeneratorJob %topN%
    rem Test the generator exit code immediately, before another command can overwrite it.
    if not "!errorlevel!"=="0" (
        echo "runbot: Stopping at depth %%d. No more URLs to fetch."
        exit /b 1
    )

    echo "batch-id"
    rem Clear the previous value first: an empty answer leaves the variable
    rem untouched, which would silently reuse the batch id of the last round.
    set "batchid="
    set /p "batchid=Enter ID: "
    echo !batchid!
    echo !batchid! batch id***********
    if not defined batchid (
        echo "runbot: Stopping at depth %%d. No batch id entered."
        exit /b 1
    )

    echo "**************************************************************"
    echo "--- Beginning FETCH at depth %%d ---"
    echo "**************************************************************"
    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.fetcher.FetcherJob -batchId !batchid!
    rem A failed fetch is only reported; the round continues with parsing.
    if not "!errorlevel!"=="0" (
        echo "runbot: fetch at depth %%d failed."
        rem echo "runbot: Deleting segment $segment."
    )

    echo "**************************************************************"
    echo "--- Beginning PARSE at depth %%d ---"
    echo "**************************************************************"
    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.parse.ParserJob -batchId !batchid!
    if not "!errorlevel!"=="0" (
        echo "runbot: Stopping at depth %%d. error in parsejob."
        exit /b 1
    )

    echo "**************************************************************"
    echo "--- Beginning UPDATEDB at depth %%d ---"
    echo "**************************************************************"
    "%JAVA%" %JAVA_HEAP_MAX% %NUTCH_LOG_OPTS% %NUTCH_OPTS% -classpath "%CLASSPATH%" org.apache.nutch.crawl.DbUpdaterJob
    if not "!errorlevel!"=="0" (
        echo "runbot: Stopping at depth %%d. error in updater."
        exit /b 1
    )
)
echo "**************************************************************"
echo "**************************************************************"
echo "**************************************************************"
echo "FIM"
echo ""
rem *** Normal completion: jump over the JAVA_HOME error handler, which would
rem *** otherwise be reached by falling through and print a bogus error.
goto done

:error
echo "ERROR: You must specify the path to your Java installation in the JAVA_HOME environment variable"
rem *** Asking for identical foreground and background colours is refused by
rem *** cmd and sets ERRORLEVEL to 1, so the script ends with a failure code.
color 00

:done
rem *** Restore environment variables
echo "FIM"
endlocal
hugo.ma wrote
>
> For example, I put the URL nabble.com in a seed file.
> Nutch then fetches and parses that URL; from the parse I get nabble.com/user
> and nabble.com/admin.
> Then in the next fetch job the three urls are fetched and parsed:
> nabble.com
> nabble.com/user
> nabble.com/admin
>
> And this process repeats until the end of the depth.(The urls are
> fictitious)
>
> I left Nutch running on Tuesday at around 18:00, and today I checked my
> SQL Server database: the last record was from Wednesday at 10:40. It is
> still running over all the fetched URLs, around 3400 pages.
>
> I didn't check Nutch yesterday because it was a holiday.
>
--
View this message in context:
http://lucene.472066.n3.nabble.com/Crawl-command-help-tp4001595p4002123.html
Sent from the Nutch - User mailing list archive at Nabble.com.