svn commit: r1387357 - in /nutch/trunk: CHANGES.txt build.xml

2012-09-18 Thread snagel
Author: snagel
Date: Tue Sep 18 20:54:05 2012
New Revision: 1387357

URL: http://svn.apache.org/viewvc?rev=1387357&view=rev
Log:
NUTCH-1415 release packages to contain top level folder apache-nutch-x.x

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1387357&r1=1387356&r2=1387357&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Sep 18 20:54:05 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1415 release packages to contain top level folder apache-nutch-x.x 
(snagel)
+
 * NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc)
 
 * NUTCH-1470 Ensure test files are included for runtime testing (lewismc)

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1387357&r1=1387356&r2=1387357&view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Sep 18 20:54:05 2012
@@ -698,14 +698,13 @@
   !-- == --
   target name=tar-src depends=package-src description=-- generate 
src.tar.gz distribution package
 tar compression=gzip longfile=gnu
-  destfile=${src.dist.version.dir}.tar.gz 
basedir=${src.dist.version.dir}
-  tarfileset dir=${dist.dir} mode=664
-   exclude name=${src.dist.version.dir}/bin/* /
-   exclude name=${src.dist.version.dir}/runtime/* /
-include name=${src.dist.version.dir}/** /
+  destfile=${src.dist.version.dir}.tar.gz
+  tarfileset dir=${src.dist.version.dir} mode=664 
prefix=${final.name}
+exclude name=src/bin/* /
+include name=** /
   /tarfileset
-  tarfileset dir=${dist.dir} mode=755
-include name=${src.dist.version.dir}/bin/* /
+  tarfileset dir=${src.dist.version.dir} mode=755 
prefix=${final.name}
+include name=src/bin/* /
   /tarfileset
 /tar
   /target
@@ -715,13 +714,13 @@
   !-- == --
   target name=tar-bin depends=package-bin description=-- generate 
bin.tar.gz distribution package
 tar compression=gzip longfile=gnu
-  destfile=${bin.dist.version.dir}.tar.gz 
basedir=${bin.dist.version.dir}
-  tarfileset dir=${dist.dir} mode=664
-   exclude name=${bin.dist.version.dir}/bin/* /
-include name=${bin.dist.version.dir}/** /
+  destfile=${bin.dist.version.dir}.tar.gz
+  tarfileset dir=${bin.dist.version.dir} mode=664 
prefix=${final.name}
+exclude name=bin/* /
+include name=** /
   /tarfileset
-  tarfileset dir=${dist.dir} mode=755
-include name=${bin.dist.version.dir}/bin/* /
+  tarfileset dir=${bin.dist.version.dir} mode=755 
prefix=${final.name}
+include name=bin/* /
   /tarfileset
 /tar
   /target
@@ -731,14 +730,13 @@
   !-- == --
   target name=zip-src depends=package-src description=-- generate 
src.zip distribution package
zip compress=true casesensitive=yes 
- destfile=${src.dist.version.dir}.zip basedir=${src.dist.version.dir}
-   zipfileset dir=${dist.dir} filemode=664
-   exclude name=${src.dist.version.dir}/bin/* /
-   exclude name=${src.dist.version.dir}/runtime/* /
-   include name=${src.dist.version.dir}/** /
+ destfile=${src.dist.version.dir}.zip
+   zipfileset dir=${src.dist.version.dir} filemode=664 
prefix=${final.name}
+   exclude name=src/bin/* /
+   include name=** /
/zipfileset
-   zipfileset dir=${dist.dir} filemode=755
-   include name=${src.dist.version.dir}/bin/* /
+   zipfileset dir=${src.dist.version.dir} filemode=755 
prefix=${final.name}
+   include name=src/bin/* /
/zipfileset
/zip
   /target
@@ -746,15 +744,15 @@
   !-- == --
   !-- Make bin release zip   --
   !-- == --
-  target name=zip-bin depends=package-bin description=-- generate 
src.zip distribution package
+  target name=zip-bin depends=package-bin description=-- generate 
bin.zip distribution package
zip compress=true casesensitive=yes 
- destfile=${bin.dist.version.dir}.zip basedir=${bin.dist.version.dir}
-   zipfileset dir=${dist.dir} filemode=664
-   exclude name=${bin.dist.version.dir}/bin/* /
-   include name=${bin.dist.version.dir}/** /
+ destfile=${bin.dist.version.dir}.zip
+   zipfileset dir=${bin.dist.version.dir} filemode=664 
prefix=${final.name}
+   exclude name=bin/* /
+   include name=** /
/zipfileset
-   zipfileset dir=${dist.dir} filemode=755
-   include name

svn commit: r1396796 - in /nutch/trunk: CHANGES.txt conf/regex-normalize.xml.template src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test src/plugin/urlnormalizer-regex/sample/regex-nor

2012-10-10 Thread snagel
Author: snagel
Date: Wed Oct 10 21:06:27 2012
New Revision: 1396796

URL: http://svn.apache.org/viewvc?rev=1396796&view=rev
Log:
NUTCH-706 Url regex normalizer: pattern for session id removal not to match 
newsId

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/regex-normalize.xml.template

nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test

nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1396796&r1=1396795&r2=1396796&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Oct 10 21:06:27 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-706 Url regex normalizer: pattern for session id removal not to match 
newsId (Meghna Kukreja via snagel)
+
 * NUTCH-1415 release packages to contain top level folder apache-nutch-x.x 
(snagel)
 
 * NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc)

Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396796&r1=1396795&r2=1396796&view=diff
==
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:06:27 2012
@@ -29,7 +29,7 @@
 
 !-- removes session ids from urls (such as jsessionid and PHPSESSID) --
 regex
-  
pattern([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
+  
pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
   substitution$4/substitution
 /regex
 

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396796&r1=1396795&r2=1396796&view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test 
(original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test 
Wed Oct 10 21:06:27 2012
@@ -11,6 +11,8 @@ http://www.foo.com/foo.html;jsessionid=1
 
http://www.foo.com/foo.html?param=1another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED
 http://www.foo.com/foo.html?param=1another=2
 
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1another=2
 http://www.foo.com/foo.html?param=1another=2
 http://www.foo.com/foo.php?x=1sid=xyzsomething=1 
http://www.foo.com/foo.php?x=1something=1
+# but NewsId is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
 
 # test removal default pages
 http://www.foo.com/home/index.html http://www.foo.com/home/

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396796&r1=1396795&r2=1396796&view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml 
(original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml 
Wed Oct 10 21:06:27 2012
@@ -13,7 +13,7 @@
 
 !-- removes session ids from urls (such as jsessionid and PHPSESSID) --
 regex
-  
pattern([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
+  
pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
   substitution$4/substitution
 /regex
 




svn commit: r1396817 - in /nutch/trunk: conf/regex-normalize.xml.template src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test src/plugin/urlnormalizer-regex/sample/regex-normalize-defau

2012-10-10 Thread snagel
Author: snagel
Date: Wed Oct 10 21:54:37 2012
New Revision: 1396817

URL: http://svn.apache.org/viewvc?rev=1396817&view=rev
Log:
NUTCH-706 (applied correct patch)

Modified:
nutch/trunk/conf/regex-normalize.xml.template

nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test

nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml

Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396817&r1=1396816&r2=1396817&view=diff
==
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:54:37 2012
@@ -29,7 +29,7 @@
 
 !-- removes session ids from urls (such as jsessionid and PHPSESSID) --
 regex
-  
pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
+  
pattern(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
   substitution$4/substitution
 /regex
 

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396817&r1=1396816&r2=1396817&view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test 
(original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test 
Wed Oct 10 21:54:37 2012
@@ -11,8 +11,13 @@ http://www.foo.com/foo.html;jsessionid=1
 
http://www.foo.com/foo.html?param=1another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED
 http://www.foo.com/foo.html?param=1another=2
 
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1another=2
 http://www.foo.com/foo.html?param=1another=2
 http://www.foo.com/foo.php?x=1sid=xyzsomething=1 
http://www.foo.com/foo.php?x=1something=1
-# but NewsId is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 
http://www.foo.com/foo.html
+http://www.foo.com/foo.php?_sessionid=qmyrcedtoutputformat=htmlpath=/3_images/foo
 http://www.foo.com/foo.php?outputformat=htmlpath=/3_images/foo
+http://www.foo.com/foo.php?_pid=2_spid=0lang=en_sessionid=e36902d5bb2d0d922fc24b43
 http://www.foo.com/foo.php?_pid=2_spid=0lang=en
+http://www.foo.com/foo.php?app=contentcontent=overviewlang=en_sid=587fba8f825b05844526519fdb7d75c8b=35m=47
 http://www.foo.com/foo.php?app=contentcontent=overviewlang=enb=35m=47
+# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328)
 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
+http://www.foo.com/home.cfm?language=encountry=ukaddressid=250646pagingpos=0
 http://www.foo.com/home.cfm?language=encountry=ukaddressid=250646pagingpos=0
 
 # test removal default pages
 http://www.foo.com/home/index.html http://www.foo.com/home/

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396817&r1=1396816&r2=1396817&view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml 
(original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml 
Wed Oct 10 21:54:37 2012
@@ -13,7 +13,7 @@
 
 !-- removes session ids from urls (such as jsessionid and PHPSESSID) --
 regex
-  
pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
+  
pattern(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern
   substitution$4/substitution
 /regex
 




svn commit: r1401458 - /nutch/branches/2.x/CHANGES.txt

2012-10-23 Thread snagel
Author: snagel
Date: Tue Oct 23 20:47:16 2012
New Revision: 1401458

URL: http://svn.apache.org/viewvc?rev=1401458&view=rev
Log:
NUTCH-1344 BasicURLNormalizer to normalize https same as http - forgot to add 
committer

Modified:
nutch/branches/2.x/CHANGES.txt

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1401458&r1=1401457&r2=1401458&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Oct 23 20:47:16 2012
@@ -8,7 +8,7 @@ Release 2.2 - Current Development
 
 * NUTCH-874 Make sure all plugins in src/plugin are compatible with Nutch 2.0 
and Gora (part 1) (Kiran Chitturi via lewismc)
 
-* NUTCH-1344 BasicURLNormalizer to normalize https same as http
+* NUTCH-1344 BasicURLNormalizer to normalize https same as http (snagel)
 
 * NUTCH-706 Url regex normalizer: pattern for session id removal not to match 
newsId (Meghna Kukreja via snagel)
 




svn commit: r1401459 - in /nutch/trunk: CHANGES.txt src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

2012-10-23 Thread snagel
Author: snagel
Date: Tue Oct 23 20:51:35 2012
New Revision: 1401459

URL: http://svn.apache.org/viewvc?rev=1401459&view=rev
Log:
NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns

Modified:
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1401459&r1=1401458&r2=1401459&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 23 20:51:35 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns 
(snagel)
+
 * NUTCH-1341 NotModified time set to now but page not modified (markus)
 
 * NUTCH-1215 UpdateDB should not require segment as input (markus)

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1401459&r1=1401458&r2=1401459&view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 Tue Oct 23 20:51:35 2012
@@ -247,7 +247,14 @@ public class RegexURLNormalizer extends 
 }
 if (patternValue != null  subValue != null) {
   Rule rule = new Rule();
-  rule.pattern = Pattern.compile(patternValue);
+  try {
+rule.pattern = Pattern.compile(patternValue);
+  } catch (PatternSyntaxException e) {
+if (LOG.isErrorEnabled()) {
+  LOG.error(skipped rule:  + patternValue +  -  + subValue + 
 : invalid regular expression pattern:  + e);
+}
+continue;
+  }
   rule.substitution = subValue;
   rules.add(rule);
 }




svn commit: r1461854 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java

2013-03-27 Thread snagel
Author: snagel
Date: Wed Mar 27 21:31:42 2013
New Revision: 1461854

URL: http://svn.apache.org/r1461854
Log:
parsechecker and indexchecker to report truncated content

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461854&r1=1461853&r2=1461854&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 27 21:31:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
+
 * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + 
lewismc)
 
 * NUTCH-1047 Pluggable indexing backends (jnioche)

Modified: 
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
Wed Mar 27 21:31:42 2013
@@ -35,6 +35,7 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -105,6 +106,10 @@ public class IndexingFiltersChecker exte
 // store the guessed content type in the crawldatum
 datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new 
Text(contentType));
 
+if (ParseSegment.isTruncated(content)) {
+  LOG.warn(Content is truncated, parse may fail!);
+}
+
 if (LOG.isInfoEnabled()) {
   LOG.info(parsing:  + url);
   LOG.info(contentType:  + contentType);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 
21:31:42 2013
@@ -106,6 +106,10 @@ public class ParserChecker implements To
   return (-1);
 }
 
+if (ParseSegment.isTruncated(content)) {
+  LOG.warn(Content is truncated, parse may fail!);
+}
+
 ParseResult parseResult = new ParseUtil(conf).parse(content);
 
 // Calculate the signature




svn commit: r1461857 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java

2013-03-27 Thread snagel
Author: snagel
Date: Wed Mar 27 21:33:38 2013
New Revision: 1461857

URL: http://svn.apache.org/r1461857
Log:
parsechecker and indexchecker to report truncated content

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461857&r1=1461856&r2=1461857&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Mar 27 21:33:38 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
+
 * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel 
via lewismc)
 
 * NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461857&r1=1461856&r2=1461857&view=diff
==
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 Wed Mar 27 21:33:38 2013
@@ -28,6 +28,7 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.crawl.CrawlStatus;
 import org.apache.nutch.parse.ParseStatusUtils;
 import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -109,6 +110,10 @@ public class IndexingFiltersChecker exte
   LOG.info(contentType:  + contentType);
 }
 
+if (ParserJob.isTruncated(url, page)) {
+  LOG.warn(Content is truncated, parse may fail!);
+}
+
 (new ParseUtil(conf)).process(url, page);
 if (!ParseStatusUtils.isSuccess(page.getParseStatus())) {
   LOG.warn(Problem with parse - check log);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461857&r1=1461856&r2=1461857&view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Wed 
Mar 27 21:33:38 2013
@@ -121,6 +121,10 @@ public class ParserChecker implements To
 
 page.setContentType(new Utf8(contentType));
 
+if (ParserJob.isTruncated(url, page)) {
+  LOG.warn(Content is truncated, parse may fail!);
+}
+
 Parse parse = new ParseUtil(conf).parse(url, page);
 
 if (parse == null) {




svn commit: r1480484 - in /nutch/branches/2.x: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

2013-05-08 Thread snagel
Author: snagel
Date: Wed May  8 22:04:04 2013
New Revision: 1480484

URL: http://svn.apache.org/r1480484
Log:
NUTCH-956 solrindex issues

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema-solr4.xml
nutch/branches/2.x/conf/schema.xml

nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484&r1=1480483&r2=1480484&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed May  8 22:04:04 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via 
lewismc, snagel)
+
 * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
 
 * NUTCH-1514 Phase out the deprecated configuration properties (if possible) 
(tejasp)

Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Wed May  8 22:04:04 2013
@@ -346,6 +346,9 @@
 
 !-- fields for creativecommons plugin --
 field name=cc type=string stored=true indexed=true 
multiValued=true/
+
+!-- fields for tld plugin --
+field name=tld type=string stored=false indexed=false/
  /fields
  uniqueKeyid/uniqueKey
  defaultSearchFieldtext/defaultSearchField

Modified: nutch/branches/2.x/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed May  8 22:04:04 2013
@@ -114,6 +114,9 @@
 !-- fields for creativecommons plugin --
 field name=cc type=string stored=true indexed=true
 multiValued=true/
+
+!-- fields for tld plugin --
+field name=tld type=string stored=false indexed=false/
 /fields
 uniqueKeyid/uniqueKey
 defaultSearchFieldcontent/defaultSearchField

Modified: 
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484&r1=1480483&r2=1480484&view=diff
==
--- 
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Wed May  8 22:04:04 2013
@@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be displayed by more.jsp (called by 
search.jsp).
+ * available), so that they can be accurately used within the search index.
  * 
- * content-type is indexed to support query by type: last-modifed is indexed to
- * support query by date:
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains 
content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 
'title' field is an attempt 
+ * to reset the title if a content-disposition hint exists. The logic is that 
such a presence is indicative 
+ * that the content provider wants the filename therein to be used as the 
title.
  * 
  * Still need to make content-length searchable!
  * 
@@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen
*/
   private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
 String mimeType = null;
-Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
+Utf8 contentType = page.getContentType();
+if (contentType == null)
+   contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
 if (contentType == null) {
   // Note by Jerome Charron on 20050415:
   // Content Type not solved by a previous plugin
@@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen
   return doc;
 }
 
-//String scontentType = mimeType.getName();
-
 doc.add(type, mimeType);
 
 // Check if we need to split the content type in sub parts
-if ( null != contentType  
conf.getBoolean(moreIndexingFilter.indexMimeTypeParts, true)) {
-  String[] parts = getParts(contentType.toString());
+if (conf.getBoolean(moreIndexingFilter.indexMimeTypeParts, true)) {
+  String[] parts = getParts

svn commit: r1480485 - in /nutch/trunk: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

2013-05-08 Thread snagel
Author: snagel
Date: Wed May  8 22:04:53 2013
New Revision: 1480485

URL: http://svn.apache.org/r1480485
Log:
NUTCH-956 solrindex issues

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml

nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1480485&r1=1480484&r2=1480485&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May  8 22:04:53 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via 
lewismc, snagel)
+
 * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
 
 * NUTCH-1514 Phase out the deprecated configuration properties (if possible) 
(tejasp)

Modified: nutch/trunk/conf/schema-solr4.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1480485&r1=1480484&r2=1480485&view=diff
==
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Wed May  8 22:04:53 2013
@@ -345,6 +345,9 @@
 
 !-- fields for creativecommons plugin --
 field name=cc type=string stored=true indexed=true 
multiValued=true/
+
+!-- fields for tld plugin --
+field name=tld type=string stored=false indexed=false/
  /fields
  uniqueKeyid/uniqueKey
  defaultSearchFieldtext/defaultSearchField

Modified: nutch/trunk/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1480485&r1=1480484&r2=1480485&view=diff
==
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Wed May  8 22:04:53 2013
@@ -114,6 +114,9 @@
 !-- fields for creativecommons plugin --
 field name=cc type=string stored=true indexed=true
 multiValued=true/
+
+!-- fields for tld plugin --
+field name=tld type=string stored=false indexed=false/
 /fields
 uniqueKeyid/uniqueKey
 defaultSearchFieldcontent/defaultSearchField

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480485&r1=1480484&r2=1480485&view=diff
==
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Wed May  8 22:04:53 2013
@@ -52,12 +52,13 @@ import org.apache.commons.lang.StringUti
 import org.apache.commons.lang.time.DateUtils;
 
 /**
- * Add (or reset) a few metaData properties as respective fields
- * (if they are available), so that they can be displayed by more.jsp
- * (called by search.jsp).
- *
- * content-type is indexed to support query by type:
- * last-modifed is indexed to support query by date:
+ * Add (or reset) a few metaData properties as respective fields (if they are
+ * available), so that they can be accurately used within the search index.
+ * 
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains 
content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 
'title' field is an attempt 
+ * to reset the title if a content-disposition hint exists. The logic is that 
such a presence is indicative 
+ * that the content provider wants the filename therein to be used as the 
title.
  *
  * Still need to make content-length searchable!
  *




svn commit: r1494776 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java

2013-06-19 Thread snagel
Author: snagel
Date: Wed Jun 19 21:26:07 2013
New Revision: 1494776

URL: http://svn.apache.org/r1494776
Log:
NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in 
CrawlDb and is generated over and over again

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1494776&r1=1494775&r2=1494776&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jun 19 21:26:07 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched 
in CrawlDb (snagel)
+
 * NUTCH-1527 Elasticsearch indexer (lufeng + markus)
 
 * NUTCH-1475 Index-More Plugin -- A better fall back value for date field 
(James Sullivan, snagel via lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1494776&r1=1494775&r2=1494776&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Wed 
Jun 19 21:26:07 2013
@@ -85,9 +85,8 @@ public abstract class AbstractFetchSched
   
   /**
* This method specifies how to schedule refetching of pages
-   * marked as GONE. Default implementation increases fetchInterval by 50%,
-   * and if it exceeds the codemaxInterval/code it calls
-   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
+   * marked as GONE. Default implementation increases fetchInterval by 50%
+   * but the value may never exceed codemaxInterval/code.
*
* @param url URL of the page.
*
@@ -102,9 +101,11 @@ public abstract class AbstractFetchSched
   long prevFetchTime, long prevModifiedTime, long fetchTime) {
 // no page is truly GONE ... just increase the interval by 50%
 // and try much later.
-datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+if ((datum.getFetchInterval() * 1.5f) < maxInterval)
+  datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+else
+  datum.setFetchInterval(maxInterval * 0.9f);
 datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
-if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, 
false);
 return datum;
   }
   




svn commit: r1494785 - /nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

2013-06-19 Thread snagel
Author: snagel
Date: Wed Jun 19 22:22:00 2013
New Revision: 1494785

URL: http://svn.apache.org/r1494785
Log:
NUTCH-1475 (fix after fix) fill field date with fetch time (as before) if 
modified time is unset

Modified:

nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1494785r1=1494784r2=1494785view=diff
==
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Wed Jun 19 22:22:00 2013
@@ -105,7 +105,7 @@ public class MoreIndexingFilter implemen
 if (time == -1) { // if no last-modified 
specified in HTTP header
   time = datum.getModifiedTime(); // use value in CrawlDatum
   if (time <= 0) {// if also unset
-time = new Date().getTime();  // use current time
+time = datum.getFetchTime();  // use time the fetch took 
place (fetchTime of fetchDatum)
   }
 }
 




svn commit: r1497557 - in /nutch/trunk: ./ conf/ src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/ src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/

2013-06-27 Thread snagel
Author: snagel
Date: Thu Jun 27 20:16:22 2013
New Revision: 1497557

URL: http://svn.apache.org/r1497557
Log:
NUTCH-1580 index-static returns object instead of value for index.static

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml

nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java

nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1497557r1=1497556r2=1497557view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 27 20:16:22 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1580 index-static returns object instead of value for index.static 
(Antoinette, lewismc, snagel)
+
 * NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus)
 
 Apache Nutch 1.7 Release - 06/20/2013 (mm/dd/yyyy)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1497557r1=1497556r2=1497557view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Jun 27 20:16:22 2013
@@ -1241,9 +1241,11 @@
   <name>index.static</name>
   <value></value>
   <description>
-  A simple plugin called at indexing that adds fields with static data. 
-  You can specify a list of fieldname:fieldcontent per nutch job.
-  It can be useful when collections can't be created by urlpatterns, 
+  Used by plugin index-static to adds fields with static data at indexing 
time. 
+  You can specify a comma-separated list of fieldname:fieldcontent per Nutch 
job.
+  Each fieldcontent can have multiple values separated by space, e.g.,
+field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ...
+  It can be useful when collections can't be created by URL patterns, 
   like in subcollection, but on a job-basis.
   /description
 /property

Modified: 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1497557r1=1497556r2=1497557view=diff
==
--- 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
 (original)
+++ 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
 Thu Jun 27 20:16:22 2013
@@ -57,7 +57,9 @@ public class StaticFieldIndexer implemen
 
 if (this.addStaticFields == true) {
   for (Entry<String, String[]> entry : this.fields.entrySet()) {
-doc.add(entry.getKey(), entry.getValue());
+for (String val : entry.getValue()) {
+  doc.add(entry.getKey(), val);
+}
   }
 }
 return doc;

Modified: 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1497557r1=1497556r2=1497557view=diff
==
--- 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
 (original)
+++ 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
 Thu Jun 27 20:16:22 2013
@@ -100,11 +100,11 @@ public class TestStaticFieldIndexerTest 
 assertNotNull(doc);
 assertFalse(test if doc is not empty, doc.getFieldNames().isEmpty());
 assertEquals(test if doc has 3 fields, 3, doc.getFieldNames().size());
-assertEquals(test if doc has field1, val1,
-((String[]) doc.getField(field1).getValues().get(0))[0]);
-assertEquals(test if doc has field2, val2,
-((String[]) doc.getField(field2).getValues().get(0))[0]);
-assertEquals(test if doc has field4, val4,
-((String[]) doc.getField(field4).getValues().get(0))[0]);
+assertTrue(test if doc has field1, doc.getField(field1).getValues()
+.contains(val1));
+assertTrue(test if doc has field2, doc.getField(field2).getValues()
+.contains(val2));
+assertTrue(test if doc has field4, doc.getField(field4).getValues()
+.contains(val4));
   }
 }




svn commit: r1507130 - in /nutch/trunk: CHANGES.txt conf/log4j.properties

2013-07-25 Thread snagel
Author: snagel
Date: Thu Jul 25 21:14:45 2013
New Revision: 1507130

URL: http://svn.apache.org/r1507130
Log:
NUTCH-1587 misspelled property threshold in conf/log4j.properties

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1507130r1=1507129r2=1507130view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 25 21:14:45 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel)
+
 * NUTCH-1604 ProtocolFactory not thread-safe (jnioche)
 
 * NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus)

Modified: nutch/trunk/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1507130r1=1507129r2=1507130view=diff
==
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Thu Jul 25 21:14:45 2013
@@ -6,7 +6,7 @@ hadoop.log.file=hadoop.log
 log4j.rootLogger=INFO,DRFA
 
 # Logging Threshold
-log4j.threshhold=ALL
+log4j.threshold=ALL
 
 #special logging requirements for some commandline tools
 log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout




svn commit: r1507131 - in /nutch/branches/2.x: CHANGES.txt conf/log4j.properties

2013-07-25 Thread snagel
Author: snagel
Date: Thu Jul 25 21:15:02 2013
New Revision: 1507131

URL: http://svn.apache.org/r1507131
Log:
NUTCH-1587 misspelled property threshold in conf/log4j.properties

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/log4j.properties

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1507131r1=1507130r2=1507131view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jul 25 21:15:02 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel)
+
 * NUTCH-1604 ProtocolFactory not thread-safe (jnioche)
 
 * NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus)

Modified: nutch/branches/2.x/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1507131r1=1507130r2=1507131view=diff
==
--- nutch/branches/2.x/conf/log4j.properties (original)
+++ nutch/branches/2.x/conf/log4j.properties Thu Jul 25 21:15:02 2013
@@ -21,7 +21,7 @@ hadoop.log.file=hadoop.log
 log4j.rootLogger=INFO,DRFA
 
 # Logging Threshold
-log4j.threshhold=ALL
+log4j.threshold=ALL
 
 #special logging requirements for some commandline tools
 log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout




svn commit: r1511479 - in /nutch/trunk: CHANGES.txt src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java

2013-08-07 Thread snagel
Author: snagel
Date: Wed Aug  7 20:44:01 2013
New Revision: 1511479

URL: http://svn.apache.org/r1511479
Log:
NUTCH-911 protocol-file to return proper protocol status for notmodified, gone, 
access_denied

Modified:
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1511479r1=1511478r2=1511479view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Aug  7 20:44:01 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via 
snagel)
+
 * NUTCH-806 Merge CrawlDBScanner with CrawlDBReader (jnioche)
 
 * NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel)

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1511479r1=1511478r2=1511479view=diff
==
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Wed Aug  7 20:44:01 2013
@@ -105,6 +105,15 @@ public class File implements Protocol {
 if (code == 200) {  // got a good response
   return new ProtocolOutput(response.toContent());  // 
return it
   
+} else if (code == 304) {   // got not modified
+  return new ProtocolOutput(response.toContent(), 
ProtocolStatus.STATUS_NOTMODIFIED);
+
+} else if (code == 401) {   // access denied / no read 
permissions
+  return new ProtocolOutput(response.toContent(), new 
ProtocolStatus(ProtocolStatus.ACCESS_DENIED));
+
+} else if (code == 404) {   // no such file
+  return new ProtocolOutput(response.toContent(), 
ProtocolStatus.STATUS_NOTFOUND);
+
 } else if (code >= 300 && code < 400) { // handle redirect
   if (redirects == MAX_REDIRECTS)
 throw new FileException("Too many redirects: " + url);




svn commit: r1511496 - in /nutch/branches/2.x: CHANGES.txt src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java

2013-08-07 Thread snagel
Author: snagel
Date: Wed Aug  7 21:10:17 2013
New Revision: 1511496

URL: http://svn.apache.org/r1511496
Log:
NUTCH-911 protocol-file to return proper protocol status for notmodified, gone, 
access_denied

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1511496r1=1511495r2=1511496view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Aug  7 21:10:17 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via 
snagel)
+
 * NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel)
 
 * NUTCH-1604 ProtocolFactory not thread-safe (jnioche)

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1511496r1=1511495r2=1511496view=diff
==
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Wed Aug  7 21:10:17 2013
@@ -114,6 +114,16 @@ public class File implements Protocol {
 
 if (code == 200) { // got a good response
   return new ProtocolOutput(response.toContent()); // return it
+
+} else if (code == 304) { // got not modified
+  return new ProtocolOutput(response.toContent(), 
ProtocolStatusUtils.STATUS_NOTMODIFIED);
+
+} else if (code == 401) { // access denied / no read permissions
+  return new ProtocolOutput(response.toContent(), 
ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED));
+
+} else if (code == 404) { // no such file
+  return new ProtocolOutput(response.toContent(), 
ProtocolStatusUtils.STATUS_NOTFOUND);
+
 } else if (code >= 300 && code < 400) { // handle redirect
   if (redirects == MAX_REDIRECTS)
 throw new FileException("Too many redirects: " + url);




svn commit: r1544341 - /nutch/branches/2.x/src/test/log4j.properties

2013-11-21 Thread snagel
Author: snagel
Date: Thu Nov 21 22:04:13 2013
New Revision: 1544341

URL: http://svn.apache.org/r1544341
Log:
NUTCH-1587 misspelled property threshold in log4j.properties

Modified:
nutch/branches/2.x/src/test/log4j.properties

Modified: nutch/branches/2.x/src/test/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/log4j.properties?rev=1544341r1=1544340r2=1544341view=diff
==
--- nutch/branches/2.x/src/test/log4j.properties (original)
+++ nutch/branches/2.x/src/test/log4j.properties Thu Nov 21 22:04:13 2013
@@ -1,7 +1,7 @@
 # log4j configuration used during build and unit tests
 
 log4j.rootLogger=info,stdout
-log4j.threshhold=INFO
+log4j.threshold=INFO
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} 
(%F:%M(%L)) - %m%n




svn commit: r1544340 - /nutch/trunk/src/test/log4j.properties

2013-11-21 Thread snagel
Author: snagel
Date: Thu Nov 21 22:03:18 2013
New Revision: 1544340

URL: http://svn.apache.org/r1544340
Log:
NUTCH-1587 misspelled property threshold in log4j.properties

Modified:
nutch/trunk/src/test/log4j.properties

Modified: nutch/trunk/src/test/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/log4j.properties?rev=1544340r1=1544339r2=1544340view=diff
==
--- nutch/trunk/src/test/log4j.properties (original)
+++ nutch/trunk/src/test/log4j.properties Thu Nov 21 22:03:18 2013
@@ -1,7 +1,7 @@
 # log4j configuration used during build and unit tests
 
 log4j.rootLogger=info,stdout
-log4j.threshhold=ALL
+log4j.threshold=ALL
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} 
(%F:%M(%L)) - %m%n




svn commit: r1560512 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

2014-01-22 Thread snagel
Author: snagel
Date: Wed Jan 22 21:13:01 2014
New Revision: 1560512

URL: http://svn.apache.org/r1560512
Log:
NUTCH-1413 Record response time

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml

nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560512r1=1560511r2=1560512view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 22 21:13:01 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel)
+
 * NUTCH-1325 HostDB for Nutch (markus, tejasp)
 
 * NUTCH-1680 CrawlDbReader to dump minRetry value (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1560512r1=1560511r2=1560512view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jan 22 21:13:01 2014
@@ -266,6 +266,16 @@
   /description
 /property
 
+<property>
+  <name>http.store.responsetime</name>
+  <value>true</value>
+  <description>Enables us to record the response time of the 
+  host which is the time period between start connection to end 
+  connection of a pages host. The response time in milliseconds
+  is stored in CrawlDb in CrawlDatum's meta data under key &quot;_rs_&quot;
+  </description>
+</property>
+
 !-- FTP properties --
 
 property

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1560512r1=1560511r2=1560512view=diff
==
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Wed Jan 22 21:13:01 2014
@@ -37,6 +37,7 @@ import org.apache.nutch.util.DeflateUtil
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
 // crawler-commons imports
@@ -47,7 +48,8 @@ import crawlercommons.robots.BaseRobotRu
  */
 public abstract class HttpBase implements Protocol {
   
-  
+  public static final Text RESPONSE_TIME = new Text("_rs_");
+
   public static final int BUFFER_SIZE = 8 * 1024;
   
   private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -92,6 +94,12 @@ public abstract class HttpBase implement
   
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
+
+  /**
+   * Record response time in CrawlDatum's meta data, see property
+   * http.store.responsetime.
+   */
+  protected boolean responseTime = true;
   
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
@@ -123,6 +131,7 @@ public abstract class HttpBase implement
   this.accept = conf.get(http.accept, accept);
   // backward-compatible default setting
   this.useHttp11 = conf.getBoolean(http.useHttp11, false);
+  this.responseTime = conf.getBoolean(http.store.responsetime, true);
   this.robots.setConf(conf);
   logConf();
   }
@@ -137,8 +146,15 @@ public abstract class HttpBase implement
 String urlString = url.toString();
 try {
   URL u = new URL(urlString);
+  
+  long startTime = System.currentTimeMillis();
   Response response = getResponse(u, datum, false); // make a request
   
+  if(this.responseTime) {
+int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
+  }
+  
   int code = response.getCode();
   byte[] content = response.getContent();
   Content c = new Content(u.toString(), u.toString(),




svn commit: r1575350 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/NutchWritable.java

2014-03-07 Thread snagel
Author: snagel
Date: Fri Mar  7 18:13:20 2014
New Revision: 1575350

URL: http://svn.apache.org/r1575350
Log:
removed HostDB from Nutch 1.8 trunk: fix build, remove HostDb related entries 
from change log

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575350r1=1575349r2=1575350view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar  7 18:13:20 2014
@@ -12,16 +12,12 @@ Nutch Development Trunk
 
 * NUTCH-1253 Incompatable neko and xerces versions (snagel, lewismc)
 
-* NUTCH-1717 HostDB not to complain if filters/normalizers are disabled 
(markus)
-
 * NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp)
 
 * NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, 
Doğacan Güney via markus)
 
 * NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel)
 
-* NUTCH-1325 HostDB for Nutch (markus, tejasp)
-
 * NUTCH-1680 CrawlDbReader to dump minRetry value (markus)
 
 * NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel 
via lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1575350r1=1575349r2=1575350view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Fri Mar  7 
18:13:20 2014
@@ -47,8 +47,7 @@ public class NutchWritable extends Gener
   org.apache.nutch.parse.ParseStatus.class,
   org.apache.nutch.protocol.Content.class,
   org.apache.nutch.protocol.ProtocolStatus.class,
-  org.apache.nutch.scoring.webgraph.LinkDatum.class,
-  org.apache.nutch.util.hostdb.HostDatum.class,
+  org.apache.nutch.scoring.webgraph.LinkDatum.class
 };
   }
 




svn commit: r1575351 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java

2014-03-07 Thread snagel
Author: snagel
Date: Fri Mar  7 18:15:50 2014
New Revision: 1575351

URL: http://svn.apache.org/r1575351
Log:
NUTCH-1706 IndexerMapReduce does not remove db_redir_temp

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575351r1=1575350r2=1575351view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar  7 18:15:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1706 IndexerMapReduce does not remove db_redir_temp (markus, snagel)
+
 * NUTCH-1113 SegmentMerger can now be safely used to merge segments (Edward 
Drapkin, markus, snagel)
 
 * NUTCH-1729 Upgrade to Tika 1.5 (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1575351r1=1575350r2=1575351view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Mar 
 7 18:15:50 2014
@@ -180,36 +180,10 @@ implements MapperText, Writable, Text, 
   dbDatum = datum;
 }
 else if (CrawlDatum.hasFetchStatus(datum)) {
-
   // don't index unmodified (empty) pages
   if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
 fetchDatum = datum;
-
-/**
- * Check if we need to delete 404 NOT FOUND and 301 PERMANENT 
REDIRECT.
- */
-if (delete) {
-  if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || 
dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
-reporter.incrCounter(IndexerStatus, Documents deleted, 1);
-
-NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);
-output.collect(key, action);
-return;
-  }
-  if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM 
||
-  fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP 
||
-  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
-  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
-reporter.incrCounter(IndexerStatus, Deleted redirects, 1);
-reporter.incrCounter(IndexerStatus, Perm redirects 
deleted, 1);
-
-NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);
-output.collect(key, action);
-return;
-  }
-}
   }
-
 } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
@@ -239,6 +213,29 @@ implements MapperText, Writable, Text, 
 LOG.warn(Unrecognized type: +value.getClass());
   }
 }
+
+// Whether to delete GONE or REDIRECTS
+if (delete && fetchDatum != null && dbDatum != null) {
+  if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || 
dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+reporter.incrCounter(IndexerStatus, Documents deleted, 1);
+
+NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);
+output.collect(key, action);
+return;
+  }
+  
+  if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
+  fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
+  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+reporter.incrCounter(IndexerStatus, Deleted redirects, 1);
+reporter.incrCounter(IndexerStatus, Perm redirects deleted, 1);
+
+NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);
+output.collect(key, action);
+return;
+  }
+}
 
 if (fetchDatum == null || dbDatum == null
 || parseText == null || parseData == null) {




svn commit: r1578620 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

2014-03-17 Thread snagel
Author: snagel
Date: Mon Mar 17 21:56:32 2014
New Revision: 1578620

URL: http://svn.apache.org/r1578620
Log:
NUTCH-1671 indexchecker to add digest field

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1578620r1=1578619r2=1578620view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Mar 17 21:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1671 indexchecker to add digest field (snagel, lufeng)
+
 * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin 
Kılınç, lufeng, Sertac TURKEL via snagel)
 
 * NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series 
(kiran, Nguyen Manh Tien, Talat UYARER, Vangelis Karvounis via lewismc)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1578620r1=1578619r2=1578620view=diff
==
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 Mon Mar 17 21:56:32 2014
@@ -37,6 +37,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolStatusUtils;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -122,6 +123,7 @@ public class IndexingFiltersChecker exte
 }
 
 NutchDocument doc = new NutchDocument();
+doc.add(digest, StringUtil.toHexString(page.getSignature()));
 
 try {
   doc = indexers.filter(doc, url, page);




svn commit: r1580046 - in /nutch/trunk: CHANGES.txt src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser

2014-03-21 Thread snagel
Author: snagel
Date: Fri Mar 21 20:56:13 2014
New Revision: 1580046

URL: http://svn.apache.org/r1580046
Log:
NUTCH-1733 parse-html to support HTML5 charset definitions

Added:

nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
   (with props)
Modified:
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1580046r1=1580045r2=1580046view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar 21 20:56:13 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
+
 * NUTCH-1671 indexchecker to add digest field (snagel, lufeng)
 
 Nutch 1.8  - 11/03/2014 (dd/mm/yyyy)

Modified: 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1580046r1=1580045r2=1580046view=diff
==
--- 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Fri Mar 21 20:56:13 2014
@@ -56,6 +56,9 @@ public class HtmlParser implements Parse
   private static Pattern charsetPattern =
 Pattern.compile(charset=\\s*([a-z][_\\-0-9a-z]*),
 Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPatternHTML5 =
+  
Pattern.compile(meta\\s+charset\\s*=\\s*[\']?([a-z][_\\-0-9a-z]*)[^]*,
+  Pattern.CASE_INSENSITIVE);
   
   private String parserImpl;
 
@@ -64,13 +67,13 @@ public class HtmlParser implements Parse
* emunknown/em encoding,  read out 'charset' parameter in the meta tag  
 
* from the first <code>CHUNK_SIZE</code> bytes.
* If there's no meta tag for Content-Type or no charset is specified,
+   * the content is checked for a Unicode Byte Order Mark (BOM).
+   * This will also cover non-byte oriented character encodings (UTF-16 only).
+   * If no character set can be determined,
* <code>null</code> is returned.  <br />
-   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
-   * can't be handled with this. 
-   * We need to do something similar to what's done by mozilla
-   * 
(http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
-   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
-   * br /
+   * See also 
http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
*
* @param content codebyte[]/code representation of an html file
*/
@@ -99,6 +102,30 @@ public class HtmlParser implements Parse
   if (charsetMatcher.find()) 
 encoding = new String(charsetMatcher.group(1));
 }
+if (encoding == null) {
+  // check for HTML5 meta charset
+  metaMatcher = charsetPatternHTML5.matcher(str);
+  if (metaMatcher.find()) {
+encoding = new String(metaMatcher.group(1));
+  }
+}
+if (encoding == null) {
+  // check for BOM
+  if (content.length >= 3
+   && content[0] == (byte) 0xEF
+   && content[1] == (byte) 0xBB
+   && content[2] == (byte) 0xBF) {
+encoding = "UTF-8";
+  } else if (content.length >= 2) {
+if (content[0] == (byte)0xFF
+ && content[1] == (byte)0xFE) {
+  encoding = "UTF-16LE";
+} else if (content[0] == (byte)0xFE
+ && content[1] == (byte)0xFF) {
+  encoding = "UTF-16BE";
+}
+  }
+}
 
 return encoding;
   }

Added: 
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1580046view=auto
==
--- 
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 (added)
+++ 
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 Fri Mar 21 20:56:13 2014
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy

svn commit: r1580270 - in /nutch/site: forrest/src/documentation/content/xdocs/downloads.xml publish/downloads.html

2014-03-22 Thread snagel
Author: snagel
Date: Sat Mar 22 18:04:10 2014
New Revision: 1580270

URL: http://svn.apache.org/r1580270
Log:
NUTCH-1742 update remaining references of 1.7 -> 1.8

Modified:
nutch/site/forrest/src/documentation/content/xdocs/downloads.xml
nutch/site/publish/downloads.html

Modified: nutch/site/forrest/src/documentation/content/xdocs/downloads.xml
URL: 
http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/downloads.xml?rev=1580270r1=1580269r2=1580270view=diff
==
--- nutch/site/forrest/src/documentation/content/xdocs/downloads.xml (original)
+++ nutch/site/forrest/src/documentation/content/xdocs/downloads.xml Sat Mar 22 
18:04:10 2014
@@ -30,10 +30,10 @@
 section
   titleDownload/title
   
-  p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.7 (src-tar, 
src-zip, bin-tar and bin-zip) are now available. See 
+  p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.8 (src-tar, 
src-zip, bin-tar and bin-zip) are now available. See 
   the 
   a 
href=http://apache.org/dist/nutch/2.2.1/CHANGES-2.2.1.txt;CHANGES-2.2.1.txt/a,
 and 
-  a 
href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.7.txt/a
+  a 
href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.8.txt/a
   files for more information on the list of updates in these releases.
   /p
   p All Apache Nutch distributions is distributed under the a 
href=http://www.apache.org/licenses/LICENSE-2.0.html;Apache License, version 
2.0/a.
@@ -61,7 +61,7 @@
apache-nutch-1.8-src.zip.md5/a/tdtda 
href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-src.zip.asc;
apache-nutch-1.8-src.zip.asc/a /td/tr
   trtdApache Nutch 1.8 (bin.tar.gz)/tdtda 
href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.tar.gz;
-   apache-nutch-1.8-bin.tar.gz/a/td tda 
href=http://apache.org/dist/nutch/1.7/apache-nutch-1.8-bin.tar.gz.md5;
+   apache-nutch-1.8-bin.tar.gz/a/td tda 
href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.md5;
apache-nutch-1.8-bin.tar.gz.md5/a /td tda 
href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.asc;
apache-nutch-1.8-bin.tar.gz.asc/a /td/tr
   trtdApache Nutch 1.8 (bin.zip)/tdtda 
href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.zip;

Modified: nutch/site/publish/downloads.html
URL: 
http://svn.apache.org/viewvc/nutch/site/publish/downloads.html?rev=1580270r1=1580269r2=1580270view=diff
==
--- nutch/site/publish/downloads.html (original)
+++ nutch/site/publish/downloads.html Sat Mar 22 18:04:10 2014
@@ -272,10 +272,10 @@ document.write(Last Published:  + docu
 a name=N1000E/aa name=Download/a
 h2 class=h3Download/h2
 div class=section
-p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.7 (src-tar, src-zip, 
bin-tar and bin-zip) are now available. See 
+p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.8 (src-tar, src-zip, 
bin-tar and bin-zip) are now available. See 
   the 
   a 
href=http://apache.org/dist/nutch/2.2.1/CHANGES-2.2.1.txt;CHANGES-2.2.1.txt/a,
 and 
-  a 
href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.7.txt/a
+  a 
href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.8.txt/a
   files for more information on the list of updates in these releases.
   /p
 p All Apache Nutch distributions is distributed under the a 
href=http://www.apache.org/licenses/LICENSE-2.0.html;Apache License, version 
2.0/a.
@@ -320,7 +320,7 @@ document.write(Last Published:  + docu
   
 tr
 td colspan=1 rowspan=1Apache Nutch 1.8 (bin.tar.gz)/tdtd colspan=1 
rowspan=1a 
href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.tar.gz;
-   apache-nutch-1.8-bin.tar.gz/a/td td colspan=1 rowspan=1a 
href=http://apache.org/dist/nutch/1.7/apache-nutch-1.8-bin.tar.gz.md5;
+   apache-nutch-1.8-bin.tar.gz/a/td td colspan=1 rowspan=1a 
href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.md5;
apache-nutch-1.8-bin.tar.gz.md5/a /td td colspan=1 
rowspan=1a 
href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.asc;
apache-nutch-1.8-bin.tar.gz.asc/a /td
 /tr




svn commit: r4777 - /release/nutch/1.7/

2014-03-22 Thread snagel
Author: snagel
Date: Sat Mar 22 18:13:52 2014
New Revision: 4777

Log:
NUTCH-1742 removed 1.7 packages from svn (svnpubsub)

Removed:
release/nutch/1.7/



svn commit: r1583193 - in /nutch/trunk: CHANGES.txt src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java

2014-03-30 Thread snagel
Author: snagel
Date: Sun Mar 30 19:58:59 2014
New Revision: 1583193

URL: http://svn.apache.org/r1583193
Log:
NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class

Added:
nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java  
 (with props)
Modified:
nutch/trunk/CHANGES.txt

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1583193r1=1583192r2=1583193view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Mar 30 19:58:59 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin 
Kılınç, lufeng, Sertac TURKEL via snagel)
+
 * NUTCH-1737 Upgrade to recent JUnit 4.x (lewismc)
 
 * NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)

Added: 
nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java?rev=1583193view=auto
==
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java 
(added)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java 
Sun Mar 30 19:58:59 2014
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for AdaptiveFetchSchedule.
+ * 
+ */
+public class TestAdaptiveFetchSchedule extends TestCase {
+
+  private float inc_rate;
+  private float dec_rate;
+  private Configuration conf;
+  private long curTime, lastModified;
+  private int changed, interval, calculateInterval;
+
+  @Before
+  public void setUp() throws Exception {
+super.setUp();
+conf = NutchConfiguration.create();
+inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+interval = 100;
+lastModified = 0;
+  }
+
+  /**
+   * Test the core functionality of AdaptiveFetchSchedule.
+   * 
+   */
+
+  @Test
+  public void testAdaptiveFetchSchedule() {
+
+FetchSchedule fs = new AdaptiveFetchSchedule();
+fs.setConf(conf);
+
+CrawlDatum p = prepareCrawlDatum();
+Text url = new Text("http://www.example.com");
+
+changed = FetchSchedule.STATUS_UNKNOWN;
+fs.setFetchSchedule(url, p, p.getFetchTime(),
+p.getModifiedTime(), curTime, lastModified, changed);
+validateFetchInterval(changed, p.getFetchInterval());
+
+changed = FetchSchedule.STATUS_MODIFIED;
+fs.setFetchSchedule(url, p, p.getFetchTime(),
+p.getModifiedTime(), curTime, lastModified, changed);
+validateFetchInterval(changed, p.getFetchInterval());
+p.setFetchInterval(interval);
+
+changed = FetchSchedule.STATUS_NOTMODIFIED;
+fs.setFetchSchedule(url, p, p.getFetchTime(),
+p.getModifiedTime(), curTime, lastModified, changed);
+validateFetchInterval(changed, p.getFetchInterval());
+
+  }
+
+  /**
+   * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to Test AdaptiveFetchSchedule.
+   * 
+   * @return properly initialized CrawlDatum
+   */
+  public CrawlDatum prepareCrawlDatum() {
+CrawlDatum p = new CrawlDatum();
+p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+p.setFetchInterval(interval);
+p.setScore(1.0f);
+p.setFetchTime(0);
+return p;
+  }
+
+  /**
+   * 
+   * The Method validates interval values according to changed parameter.
+   * 
+   * @param changed
+   *  status value to check calculated interval value.
+   * @param getInterval
+   *  to test IntervalValue from CrawlDatum which is calculated via
+   *  AdaptiveFetchSchedule algorithm.
+   */
+  private void validateFetchInterval(int changed, int getInterval) {
+
+if (changed == FetchSchedule.STATUS_UNKNOWN) {
+  assertEquals

svn commit: r1585144 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

2014-04-05 Thread snagel
Author: snagel
Date: Sat Apr  5 17:06:04 2014
New Revision: 1585144

URL: http://svn.apache.org/r1585144
Log:
NUTCH-1735 code dedup fetcher queue redirects

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1585144r1=1585143r2=1585144view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr  5 17:06:04 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1735 code dedup fetcher queue redirects (snagel)
+
 * NUTCH-1745 Upgrade to ElasticSearch 1.1.0 (jnioche)
 
 * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin 
Kılınç, lufeng, Sertac TURKEL via snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1585144r1=1585143r2=1585144view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Apr  5 
17:06:04 2014
@@ -731,25 +731,7 @@ public class Fetcher extends Configured 
refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
   if (redirUrl != null) {
-CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
-fit.datum.getFetchInterval(), fit.datum.getScore());
-// transfer existing metadata to the redir
-newDatum.getMetaData().putAll(fit.datum.getMetaData());
-scfilters.initialScore(redirUrl, newDatum);
-if (reprUrl != null) {
-  newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-  new Text(reprUrl));
-}
-fit = FetchItem.create(redirUrl, newDatum, queueMode);
-if (fit != null) {
-  FetchItemQueue fiq =
-fetchQueues.getFetchItemQueue(fit.queueID);
-  fiq.addInProgressFetchItem(fit);
-} else {
-  // stop redirecting
-  redirecting = false;
-  reporter.incrCounter(FetcherStatus, 
FetchItem.notCreated.redirect, 1);
-}
+queueRedirect(redirUrl, fit);
   }
 }
 break;
@@ -772,25 +754,7 @@ public class Fetcher extends Configured 
  urlString, newUrl, temp,
  Fetcher.PROTOCOL_REDIR);
 if (redirUrl != null) {
-  CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
-  fit.datum.getFetchInterval(), fit.datum.getScore());
-  // transfer existing metadata
-  newDatum.getMetaData().putAll(fit.datum.getMetaData());
-  scfilters.initialScore(redirUrl, newDatum);
-  if (reprUrl != null) {
-newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-new Text(reprUrl));
-  }
-  fit = FetchItem.create(redirUrl, newDatum, queueMode);
-  if (fit != null) {
-FetchItemQueue fiq =
-  fetchQueues.getFetchItemQueue(fit.queueID);
-fiq.addInProgressFetchItem(fit);
-  } else {
-// stop redirecting
-redirecting = false;
-reporter.incrCounter(FetcherStatus, 
FetchItem.notCreated.redirect, 1);
-  }
+  queueRedirect(redirUrl, fit);
 } else {
   // stop redirecting
   redirecting = false;
@@ -918,6 +882,28 @@ public class Fetcher extends Configured 
   }
 }
 
+private void queueRedirect(Text redirUrl, FetchItem fit) throws 
ScoringFilterException {
+  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+  fit.datum.getFetchInterval(), fit.datum.getScore());
+  // transfer all existing metadata to the redirect
+  newDatum.getMetaData().putAll(fit.datum.getMetaData());
+  scfilters.initialScore(redirUrl, newDatum);
+  if (reprUrl != null) {
+newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+new Text(reprUrl));
+  }
+  fit = FetchItem.create(redirUrl, newDatum, queueMode);
+  if (fit != null) {
+FetchItemQueue fiq =
+  fetchQueues.getFetchItemQueue(fit.queueID);
+fiq.addInProgressFetchItem

svn commit: r1590315 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java

2014-04-26 Thread snagel
Author: snagel
Date: Sat Apr 26 22:12:46 2014
New Revision: 1590315

URL: http://svn.apache.org/r1590315
Log:
NUTCH-1764 readdb to show command-line help if no action (-stats, -dump, etc.) 
given

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1590315r1=1590314r2=1590315view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 26 22:12:46 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1764 readdb to show command-line help if no action (-stats, -dump, 
etc.) given (Diaa via snagel)
+
 * NUTCH-1700 Remove deprecated code from creativecommons plugin (lewismc)
 
 * NUTCH-1761 Crawl script fails to find job file if not started from inside 
bin dir (David Hosking, jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1590315r1=1590314r2=1590315view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Sat Apr 26 
22:12:46 2014
@@ -542,7 +542,7 @@ public class CrawlDbReader implements Cl
   public static void main(String[] args) throws IOException {
 CrawlDbReader dbr = new CrawlDbReader();
 
-if (args.length < 1) {
+if (args.length < 2) {
   System.err.println(Usage: CrawlDbReader crawldb (-stats | -dump 
out_dir | -topN  out_dir [min] | -url url));
   System.err.println(\tcrawldb\tdirectory name where crawldb is 
located);
   System.err.println(\t-stats [-sort] \tprint overall statistics to 
System.out);




svn commit: r1592414 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherReducer.java

2014-05-04 Thread snagel
Author: snagel
Date: Sun May  4 20:18:50 2014
New Revision: 1592414

URL: http://svn.apache.org/r1592414
Log:
NUTCH-1182 fetcher to log hung threads

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1592414r1=1592413r2=1592414view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun May  4 20:18:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1182 fetcher to log hung threads (snagel)
+
 * NUTCH-1618 Turn speculative execution off for Fetching (talat)
 
 * NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set 
in HTMLParser (talat)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1592414r1=1592413r2=1592414view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java 
Sun May  4 20:18:50 2014
@@ -871,7 +871,24 @@ extends GoraReducerIntWritable, FetchEn
   
   // some requests seem to hang, despite all intentions
   if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
-LOG.warn("Aborting with " + activeThreads + " hung threads.");
+if (LOG.isWarnEnabled() && activeThreads.get() > 0) {
+  LOG.warn("Aborting with " + activeThreads + " hung threads.");
+  for (int i = 0; i < fetcherThreads.size(); i++) {
+FetcherThread thread = fetcherThreads.get(i);
+if (thread.isAlive()) {
+  LOG.warn("Thread #" + i + " hung while processing " + 
thread.reprUrl);
+  if (LOG.isDebugEnabled()) {
+StackTraceElement[] stack = thread.getStackTrace();
+StringBuilder sb = new StringBuilder();
+sb.append("Stack of thread #").append(i).append(":\n");
+for (StackTraceElement s : stack) {
+  sb.append(s.toString()).append('\n');
+}
+LOG.debug(sb.toString());
+  }
+}
+  }
+}
 return;
   }
 




svn commit: r1594071 - in /nutch: branches/2.x/ branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ trunk/ trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/a

2014-05-12 Thread snagel
Author: snagel
Date: Mon May 12 19:39:43 2014
New Revision: 1594071

URL: http://svn.apache.org/r1594071
Log:
NUTCH-1752 Cache robots.txt rules per protocol:host:port

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594071r1=1594070r2=1594071view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
 * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with 2 
threads (brian44 via jnioche)
 
 * NUTCH-1182 fetcher to log hung threads (snagel)

Modified: 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071r1=1594070r2=1594071view=diff
==
--- 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
 allowForbidden = conf.getBoolean(http.robots.403.allow, false);
   }
 
+  /** Compose unique key to store and access robot rules in cache for given 
URL */
+  protected static String getCacheKey(URL url) {
+String protocol = url.getProtocol().toLowerCase();  // normalize to lower 
case
+String host = url.getHost().toLowerCase();  // normalize to lower 
case
+int port = url.getPort();
+if (port == -1) {
+  port = url.getDefaultPort();
+}
+   /* Robot rules apply only to host, protocol, and port where robots.txt is
+* hosted (cf. NUTCH-1752). Consequently  */
+String cacheKey = protocol + ":" + host + ":" + port;
+return cacheKey;
+  }
+
   /**
-   * The hosts for which the caching of robots rules is yet to be done,
-   * it sends a Http request to the host corresponding to the {@link URL} 
-   * passed, gets robots file, parses the rules and caches the rules object
-   * to avoid re-work in future.
+   * Get the rules from robots.txt which applies for the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, a HTTP request is send to fetch
+   * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
+   * rules are cached to avoid re-fetching and re-parsing it again.
* 
-   *  @param http The {@link Protocol} object
-   *  @param url URL 
-   *  
-   *  @return robotRules A {@link BaseRobotRules} object for the rules
+   * @param http
+   *  The {@link Protocol} object
+   * @param url
+   *  URL robots.txt applies to
+   *
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
*/
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
-String protocol = url.getProtocol().toLowerCase();  // normalize to lower 
case
-String host = url.getHost().toLowerCase();  // normalize to lower 
case
-
-BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + 
host);
+String cacheKey = getCacheKey(url);
+BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
 boolean cacheRule = true;
 
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
   }
 
   if (cacheRule) {
-CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
-if (redir != null && !redir.getHost().equals(host)) {
+CACHE.put(cacheKey, robotRules);  // cache rules for host
+if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) 
{
   // cache also for the redirected host
-  CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+  CACHE.put(getCacheKey(redir), robotRules);
 }
   }
 }

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1594071r1=1594070r2=1594071view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel

svn commit: r1593595 - in /nutch/site: forrest/src/documentation/content/xdocs/index.xml publish/index.html

2014-05-15 Thread snagel
Author: snagel
Date: Fri May  9 18:48:29 2014
New Revision: 1593595

URL: http://svn.apache.org/r1593595
Log:
Nutch 1.8 includes Tika 1.5

Modified:
nutch/site/forrest/src/documentation/content/xdocs/index.xml
nutch/site/publish/index.html

Modified: nutch/site/forrest/src/documentation/content/xdocs/index.xml
URL: 
http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/index.xml?rev=1593595r1=1593594r2=1593595view=diff
==
--- nutch/site/forrest/src/documentation/content/xdocs/index.xml (original)
+++ nutch/site/forrest/src/documentation/content/xdocs/index.xml Fri May  9 
18:48:29 2014
@@ -86,7 +86,7 @@
  pThe Apache Nutch PMC are pleased to announce the immediate release of 
Apache Nutch v1.8, we advise all
  current users and developers of the 1.X series to upgrade to this 
release. Alhough this
  release includes library upgrades to a 
href=http://code.google.com/p/crawler-commons/;Crawler Commons/a 0.3 and
- a href=http://tika.apache.org;Apache Tika/a 1.4, it also provides 
over 30 bug fixes as well as 18 improvements.
+ a href=http://tika.apache.org;Apache Tika/a 1.5, it also provides 
over 30 bug fixes as well as 18 improvements.
  Please see the a 
href=http://www.apache.org/dist/nutch/1.8/CHANGES.txt;list of changes/a for 
a full
  breakdown, or see the a href=http://s.apache.org/oHY;release 
report/a.
  As usual in the 1.X series, this release is made available both as source 
and binary. Additionally developers

Modified: nutch/site/publish/index.html
URL: 
http://svn.apache.org/viewvc/nutch/site/publish/index.html?rev=1593595r1=1593594r2=1593595view=diff
==
--- nutch/site/publish/index.html (original)
+++ nutch/site/publish/index.html Fri May  9 18:48:29 2014
@@ -443,7 +443,7 @@ document.write(Last Published:  + docu
 pThe Apache Nutch PMC are pleased to announce the immediate release of 
Apache Nutch v1.8, we advise all
  current users and developers of the 1.X series to upgrade to this 
release. Alhough this
  release includes library upgrades to a 
href=http://code.google.com/p/crawler-commons/;Crawler Commons/a 0.3 and
- a href=http://tika.apache.org;Apache Tika/a 1.4, it also provides 
over 30 bug fixes as well as 18 improvements.
+ a href=http://tika.apache.org;Apache Tika/a 1.5, it also provides 
over 30 bug fixes as well as 18 improvements.
  Please see the a 
href=http://www.apache.org/dist/nutch/1.8/CHANGES.txt;list of changes/a for 
a full
  breakdown, or see the a href=http://s.apache.org/oHY;release 
report/a.
  As usual in the 1.X series, this release is made available both as source 
and binary. Additionally developers




svn commit: r1604291 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/fetcher/ branches/2.x/src/java/org/apache/nutch/protocol/ trunk/ trunk/conf/ trunk/src/java/or

2014-06-20 Thread snagel
Author: snagel
Date: Fri Jun 20 22:15:43 2014
New Revision: 1604291

URL: http://svn.apache.org/r1604291
Log:
NUTCH-1718 redefine http.robots.agent as additional agent names

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604291r1=1604290r2=1604291view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:15:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1718 redefine http.robots.agent as additional agent names (snagel, 
Tejas Patil, Daniel Kugel)
+
 * NUTCH-1796 Ensure Gora object builders are used as oppose to empty 
constructors (snagel via lewismc)
 
 * NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc 
(jnioche)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1604291r1=1604290r2=1604291view=diff
==
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Jun 20 22:15:43 2014
@@ -90,11 +90,18 @@
 
 property
   namehttp.robots.agents/name
-  value*/value
-  descriptionThe agent strings we'll look for in robots.txt files,
-  comma-separated, in decreasing order of precedence. You should
-  put the value of http.agent.name as the first agent name, and keep the
-  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+  value/value
+  descriptionAny other agents, apart from 'http.agent.name', that the robots
+  parser would look for in robots.txt. Multiple agents can be provided using 
+  comma as a delimiter. eg. mybot,foo-spider,bar-crawler
+  
+  The ordering of agents does NOT matter and the robots parser would make 
+  decision based on the agent which matches first to the robots rules.  
+  Also, there is NO need to add a wildcard (ie. *) to this string as the 
+  robots parser would smartly take care of a no-match situation. 
+
+  If no value is specified, by default HTTP agent (ie. 'http.agent.name') 
+  would be used for user agent matching by the robots parser. 
   /description
 /property
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1604291r1=1604290r2=1604291view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Fri 
Jun 20 22:15:43 2014
@@ -255,10 +255,7 @@ public class FetcherJob extends NutchToo
   }
 
   void checkConfiguration() {
-
-// ensure that a value has been set for the agent name and that that
-// agent name is the first value in the agents we advertise for robot
-// rules parsing
+// ensure that a value has been set for the agent name
 String agentName = getConf().get(http.agent.name);
 if (agentName == null || agentName.trim().length() == 0) {
   String message = Fetcher: No agents listed in 'http.agent.name'
@@ -267,23 +264,6 @@ public class FetcherJob extends NutchToo
 LOG.error(message);
   }
   throw new IllegalArgumentException(message);
-} else {
-
-  // get all of the agents that we advertise
-  String agentNames = getConf().get("http.robots.agents");
-  StringTokenizer tok = new StringTokenizer(agentNames, ",");
-  ArrayList<String> agents = new ArrayList<String>();
-  while (tok.hasMoreTokens()) {
-agents.add(tok.nextToken().trim());
-  }
-
-  // if the first one is not equal to our agent name, log fatal and throw
-  // an exception
-  if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
-String message = Fetcher: Your 'http.agent.name' value should be 
-+ listed first in 'http.robots.agents' property.;
-LOG.warn(message);
-  }
 }
   }
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291r1=1604290r2=1604291view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java

svn commit: r1604298 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ branches/2.x/src/plugin/parse-html

2014-06-20 Thread snagel
Author: snagel
Date: Fri Jun 20 22:56:32 2014
New Revision: 1604298

URL: http://svn.apache.org/r1604298
Log:
NUTCH-1767 remove special treatment of params in relative links

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java

nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java

nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604298r1=1604297r2=1604298view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1767 remove special treatment of params in relative links (snagel)
+
 * NUTCH-1718 redefine http.robots.agent as additional agent names (snagel, 
Tejas Patil, Daniel Kugel)
 
 * NUTCH-1796 Ensure Gora object builders are used as oppose to empty 
constructors (snagel via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298r1=1604297r2=1604298view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 
22:56:32 2014
@@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma
 public class URLUtil {
 
   /**
-   * Resolve relative URL-s and fix a few java.net.URL errors
-   * in handling of URLs with embedded params and pure query
-   * targets.
+   * Resolve relative URL-s and fix a java.net.URL error
+   * in handling of URLs with pure query targets.
* @param base base url
* @param target target url (may be relative)
* @return resolved absolute url.

Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1604298r1=1604297r2=1604298view=diff
==
--- 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 Fri Jun 20 22:56:32 2014
@@ -298,51 +298,6 @@ public class DOMContentUtils {
   }
   
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * p
-   * If the taget contains params (i.e. ';') information then the target 
-   * params information is assumed to be correct and any base params 
information
-   * is ignored.  If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base The base URL.
-   * @param target The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException If the url is not a well formed URL.
-   */
-  private URL fixEmbeddedParams(URL base, String target) 
-throws MalformedURLException{
-
-// the target contains params information or the base doesn't then no
-// conversion necessary, return regular URL
-if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-  return new URL(base, target);
-}
-
-// get the base url and it params information
-String baseURL = base.toString();
-int startParams = baseURL.indexOf(';');
-String params = baseURL.substring(startParams);
-
-// if the target has a query string then put the params information after
-// any path but before the query string, otherwise just append to the path
-int startQS = target.indexOf('?');
-if (startQS >= 0) {
-  target = target.substring(0, startQS) + params + 
-target.substring(startQS);
-}
-else {
-  target += params;
-}
-
-return URLUtil.resolveURL(base, target);
-  }
-
-  /**
* This method finds all anchors below the supplied DOM
* codenode/code, and creates appropriate {@link Outlink}
* records for each (relative

svn commit: r1605204 [3/3] - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/api/ branches/2.x/src/java/org/apache/nutch/api/impl/ branches/2.x/src/java/org/apache/nutch/crawl/ branche

2014-06-24 Thread snagel
Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1605204r1=1605203r2=1605204view=diff
==
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 Tue Jun 24 21:41:28 2014
@@ -31,7 +31,13 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.conf.Configured;
 import org.apache.oro.text.regex.*;
 
-/** Converts URLs to a normal form . */
+/**
+ * Converts URLs to a normal form:
+ * <ul>
+ * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
+ * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
+ * </ul>
+ */
 public class BasicURLNormalizer extends Configured implements URLNormalizer {
 public static final Logger LOG = 
LoggerFactory.getLogger(BasicURLNormalizer.class);
 

Added: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java?rev=1605204view=auto
==
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
 Tue Jun 24 21:41:28 2014
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer performing basic normalizations: remove default ports
+ * and dot segments in path.
+ */
+package org.apache.nutch.net.urlnormalizer.basic;

Propchange: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
--
svn:eol-style = native

Added: 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java?rev=1605204view=auto
==
--- 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
 Tue Jun 24 21:41:28 2014
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer renaming hosts to a canonical form listed in the
+ * configuration file.
+ */
+package org.apache.nutch.net.urlnormalizer.host;

Propchange: 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
--
svn:eol-style = native

Added: 

svn commit: r1607929 - /nutch/trunk/build.xml

2014-07-04 Thread snagel
Author: snagel
Date: Fri Jul  4 20:15:12 2014
New Revision: 1607929

URL: http://svn.apache.org/r1607929
Log:
add dependency init (calling ivy-init) to compile-core-test to fix 
nightly build failures introduced with NUTCH-1803

Modified:
nutch/trunk/build.xml

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1607929r1=1607928r2=1607929view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jul  4 20:15:12 2014
@@ -355,7 +355,7 @@
   !-- == --
   !-- Compile test code  -- 
   !-- == --
-  <target name="compile-core-test" depends="resolve-test, compile-core" 
description="--> compile test code">
+  <target name="compile-core-test" depends="init, resolve-test, compile-core" 
description="--> compile test code">
 javac 
  encoding=${build.encoding} 
  srcdir=${test.src.dir}




svn commit: r1608130 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/test/org/apache/nutch/util/ branches/2.x/src/testresources/test-mime-util/ trunk/ trunk/sr

2014-07-05 Thread snagel
Author: snagel
Date: Sat Jul  5 20:36:33 2014
New Revision: 1608130

URL: http://svn.apache.org/r1608130
Log:
NUTCH-1605 MIME type detector recognizes xlsx as zip file

Added:
nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java   (with 
props)
nutch/branches/2.x/src/testresources/test-mime-util/
nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx   (with props)
nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java   (with props)
nutch/trunk/src/testresources/test-mime-util/
nutch/trunk/src/testresources/test-mime-util/test.xlsx   (with props)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608130r1=1608129r2=1608130view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Jul  5 20:36:33 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)
+
 * NUTCH-385 Improve description of thread related configuration for Fetcher 
(jnioche,lufeng)
 
 * NUTCH-1798 Crawl script not calling index command correctly (Aaron Bedward 
via jnioche)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130r1=1608129r2=1608130view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul  5 
20:36:33 2014
@@ -19,13 +19,16 @@ package org.apache.nutch.util;
 
 // JDK imports
 import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
 // Tika imports
 import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
@@ -128,10 +131,10 @@ public final class MimeUtil {
* strategies available within Tika. First, the mime type provided in
* codetypeName/code is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
-   * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} 
is
-   * found, then that mime type is used, otherwise URL resolution is
-   * used to try and determine the mime type. If that means is unsuccessful, 
and
-   * if codemime.type.magic/code is enabled in {@link NutchConfiguration},
+   * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
+   * is found, then that mime type is used, otherwise URL resolution is
+   * used to try and determine the mime type. However, if
+   * codemime.type.magic/code is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
* better-than-the-default approximation of the {@link MimeType}.
* 
@@ -145,24 +148,19 @@ public final class MimeUtil {
*/
   public String autoResolveContentType(String typeName, String url, byte[] 
data) {
 String retType = null;
-String magicType = null;
 MimeType type = null;
 String cleanedMimeType = null;
 
-try {
-  cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? 
this.mimeTypes
-  .forName(MimeUtil.cleanMimeType(typeName)).getName()
-  : null;
-} catch (MimeTypeException mte) {
-  // Seems to be a malformed mime type name...
-}
-
+cleanedMimeType = MimeUtil.cleanMimeType(typeName);
 // first try to get the type from the cleaned type name
-try {
-  type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
-  : null;
-} catch (MimeTypeException e) {
-  type = null;
+if (cleanedMimeType != null) {
+  try {
+type = mimeTypes.forName(cleanedMimeType);
+cleanedMimeType = type.getName();
+  } catch (MimeTypeException mte) {
+// Seems to be a malformed mime type name...
+cleanedMimeType = null;
+  }
 }
 
 // if returned null, or if it's the default type then try url resolution
@@ -172,8 +170,6 @@ public final class MimeUtil {
   // mime-type, then guess a mime-type from the url pattern
 
   try {
-TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
-Tika tika = new Tika(tikaConfig);
 retType = tika.detect(url) != null ? tika.detect(url) : null;
   } catch (Exception e

svn commit: r1608135 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/crawl branches/2.x/src/bin/nutch trunk/CHANGES.txt trunk/src/bin/crawl trunk/src/bin/nutch

2014-07-05 Thread snagel
Author: snagel
Date: Sat Jul  5 21:13:19 2014
New Revision: 1608135

URL: http://svn.apache.org/r1608135
Log:
NUTCH-1566 bin/nutch to allow whitespace in paths

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/crawl
nutch/branches/2.x/src/bin/nutch
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/crawl
nutch/trunk/src/bin/nutch

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608135r1=1608134r2=1608135view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Jul  5 21:13:19 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel)
+
 * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)
 
 * NUTCH-385 Improve description of thread related configuration for Fetcher 
(jnioche,lufeng)

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1608135r1=1608134r2=1608135view=diff
==
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Sat Jul  5 21:13:19 2014
@@ -70,12 +70,12 @@ timeLimitFetch=180
 addDays=0
 #
 
-bin=`dirname $0`
-bin=`cd $bin; pwd`
+bin=`dirname $0`
+bin=`cd $bin; pwd`
 
 # determines whether mode based on presence of job file
 mode=local
-if [ -f ${bin}/../*nutch*.job ]; then
+if [ -f ${bin}/../*nutch*.job ]; then
 mode=distributed
 fi
 
@@ -92,8 +92,7 @@ if [ $mode = distributed ]; then
 fi
 
 # initial injection
-$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID
-
+$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID
 if [ $? -ne 0 ] 
   then exit $? 
 fi
@@ -114,14 +113,14 @@ do
   batchId=`date +%s`-$RANDOM
 
   echo Generating a new fetchlist
-  $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
+  $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
   
   if [ $? -ne 0 ] 
   then exit $? 
   fi
 
   echo Fetching : 
-  $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId $CRAWL_ID -threads 50
+  $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId $CRAWL_ID -threads 50
 
   if [ $? -ne 0 ] 
   then exit $? 
@@ -132,7 +131,7 @@ do
   # enable the skipping of records for the parsing so that a dodgy document 
   # so that it does not fail the full task
   skipRecordsOptions=-D mapred.skip.attempts.to.start.skipping=2 -D 
mapred.skip.map.max.skip.records=1
-  $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId 
$CRAWL_ID
+  $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId 
$CRAWL_ID
 
   if [ $? -ne 0 ] 
   then exit $? 
@@ -140,21 +139,21 @@ do
 
   # updatedb with this batch
   echo CrawlDB update for $CRAWL_ID
-  $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID
+  $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID
 
   if [ $? -ne 0 ] 
   then exit $? 
   fi
 
   echo Indexing $CRAWL_ID on SOLR index - $SOLRURL
-  $bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId 
$CRAWL_ID
+  $bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId 
$CRAWL_ID
   
   if [ $? -ne 0 ] 
then exit $? 
   fi
 
   echo SOLR dedup - $SOLRURL
-  $bin/nutch solrdedup $commonOptions $SOLRURL
+  $bin/nutch solrdedup $commonOptions $SOLRURL
   
   if [ $? -ne 0 ] 
then exit $? 

Modified: nutch/branches/2.x/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1608135r1=1608134r2=1608135view=diff
==
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Sat Jul  5 21:13:19 2014
@@ -25,6 +25,14 @@
 #   Default is 1000.
 #
 #   NUTCH_OPTS  Extra Java runtime options.
+#   Multiple options must be separated by white space.
+#
+#   NUTCH_LOG_DIR   Log directory (default: $NUTCH_HOME/logs)
+#
+#   NUTCH_LOGFILE   Log file (default: hadoop.log)
+#
+#   NUTCH_CONF_DIR  Path(s) to configuration files (default: $NUTCH_HOME/conf).
+#   Multiple paths must be separated by a colon ':'.
 #
 cygwin=false
 case `uname` in
@@ -78,13 +86,13 @@ COMMAND=$1
 shift
 
 # some directories
-THIS_DIR=`dirname $THIS`
-NUTCH_HOME=`cd $THIS_DIR/.. ; pwd`
+THIS_DIR=`dirname $THIS`
+NUTCH_HOME=`cd $THIS_DIR/.. ; pwd`
 
 # some Java parameters
 if [ $NUTCH_JAVA_HOME !=  ]; then
   #echo run java in $NUTCH_JAVA_HOME
-  JAVA_HOME=$NUTCH_JAVA_HOME
+  JAVA_HOME=$NUTCH_JAVA_HOME
 fi
   
 if [ $JAVA_HOME

svn commit: r1608136 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/plugin/ trunk/ trunk/src/java/org/apache/nutch/plugin/

2014-07-05 Thread snagel
Author: snagel
Date: Sat Jul  5 21:42:20 2014
New Revision: 1608136

URL: http://svn.apache.org/r1608136
Log:
NUTCH-1776 Log incorrect plugin.folder file path

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608136r1=1608135r2=1608136view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Jul  5 21:42:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel)
+
 * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel)
 
 * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1608136r1=1608135r2=1608136view=diff
==
--- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
Sat Jul  5 21:42:20 2014
@@ -133,6 +133,9 @@ public class PluginManifestParser {
   } catch (UnsupportedEncodingException e) {
   }
   directory = new File(path);
+} else if (!directory.exists()) {
+  LOG.warn("Plugins: directory not found: " + name);
+  return null;
 }
 return directory;
   }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1608136r1=1608135r2=1608136view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java 
Sat Jul  5 21:42:20 2014
@@ -71,6 +71,10 @@ public class PluginRepository {
 PluginManifestParser manifestParser = new PluginManifestParser(this.conf, 
this);
 Map<String, PluginDescriptor> allPlugins = manifestParser
 .parsePluginFolder(pluginFolders);
+if (allPlugins.isEmpty()) {
+  LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"",
+  conf.get("plugin.folders"));
+}
 Pattern excludes = Pattern.compile(conf.get("plugin.excludes", ""));
 Pattern includes = Pattern.compile(conf.get("plugin.includes", ""));
 Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes,

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1608136r1=1608135r2=1608136view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Jul  5 21:42:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel)
+
 * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel)
 
 * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1608136r1=1608135r2=1608136view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Sat 
Jul  5 21:42:20 2014
@@ -134,6 +134,9 @@ public class PluginManifestParser {
   } catch (UnsupportedEncodingException e) {
   }
   directory = new File(path);
+} else if (!directory.exists()) {
+  LOG.warn("Plugins: directory not found: " + name);
+  return null;
 }
 return directory;
   }

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1608136r1=1608135r2=1608136view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Sat Jul  
5 21:42:20 2014
@@ -74,6 +74,10 @@ public class

svn commit: r1609568 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/nutch trunk/CHANGES.txt trunk/src/bin/nutch

2014-07-10 Thread snagel
Author: snagel
Date: Thu Jul 10 20:50:27 2014
New Revision: 1609568

URL: http://svn.apache.org/r1609568
Log:
NUTCH-1811 bin/nutch junit to use junit 4 test runner

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/nutch
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/nutch

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1609568r1=1609567r2=1609568view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jul 10 20:50:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel)
+
 * NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel)
 
 * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel)

Modified: nutch/branches/2.x/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1609568r1=1609567r2=1609568view=diff
==
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Thu Jul 10 20:50:27 2014
@@ -238,7 +238,7 @@ elif [ $COMMAND = nutchserver ] ; th
 CLASS=org.apache.nutch.api.NutchServer
 elif [ $COMMAND = junit ] ; then
   CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/
-  CLASS=junit.textui.TestRunner
+  CLASS=org.junit.runner.JUnitCore
 else
 CLASS=$COMMAND
 fi

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1609568r1=1609567r2=1609568view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 10 20:50:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel)
+
 * NUTCH-1799 ANT Eclipse task discovers all plugin jars automatically (jnioche)
 
 * NUTCH-578 URL fetched with 403 is generated over and over again (snagel)

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1609568r1=1609567r2=1609568view=diff
==
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Thu Jul 10 20:50:27 2014
@@ -262,7 +262,7 @@ elif [ $COMMAND = plugin ] ; then
   CLASS=org.apache.nutch.plugin.PluginRepository
 elif [ $COMMAND = junit ] ; then
   CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/
-  CLASS=junit.textui.TestRunner
+  CLASS=org.junit.runner.JUnitCore
 else
   CLASS=$COMMAND
 fi




svn commit: r1614375 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/indexer/ branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic

2014-07-29 Thread snagel
Author: snagel
Date: Tue Jul 29 15:13:20 2014
New Revision: 1614375

URL: http://svn.apache.org/r1614375
Log:
NUTCH-1708 use same id when indexing and deleting redirects

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema.xml

nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml
nutch/trunk/conf/solrindex-mapping.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java

nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1614375r1=1614374r2=1614375view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
 * NUTCH-1817 Remove pom.xml from source (jnioche)
 
 * NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel)

Modified: nutch/branches/2.x/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1614375r1=1614374r2=1614375view=diff
==
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Tue Jul 29 15:13:20 2014
@@ -307,7 +307,7 @@
 to include it as performance improvements are minimal. --
 field name=_version_ type=long indexed=true stored=true/
 
-<field name="id" type="string" stored="true" indexed="true"/>
+<field name="id" type="string" stored="true" indexed="true" 
required="true"/>
 
 !-- core fields --
 field name=batchId type=string stored=true indexed=false/
@@ -316,7 +316,7 @@
 
 !-- fields for index-basic plugin --
 field name=host type=url stored=false indexed=true/
-<field name="url" type="url" stored="true" indexed="true" required="true"/>
+<field name="url" type="url" stored="true" indexed="true"/>
 field name=orig type=url stored=true indexed=true /
 !-- stored=true for highlighting, use term vectors  and positions for 
fast highlighting --
 field name=content type=text_general stored=true indexed=true/

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375r1=1614374r2=1614375view=diff
==
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 Tue Jul 29 15:13:20 2014
@@ -123,6 +123,7 @@ public class IndexingFiltersChecker exte
 }
 
 NutchDocument doc = new NutchDocument();
+doc.add("id", url);
 doc.add("digest", StringUtil.toHexString(page.getSignature()));
 
 try {

Modified: 
nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375r1=1614374r2=1614375view=diff
==
--- 
nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
 Tue Jul 29 15:13:20 2014
@@ -121,7 +121,7 @@ public class ElasticIndexWriter implemen
 
   @Override
   public void write(NutchDocument doc) throws IOException {
-String id = (String) doc.getFieldValue("url");
+String id = (String) doc.getFieldValue("id");
 String type = doc.getDocumentMeta().get("type");
 if (type == null)
   type = "doc";

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614375r1=1614374r2=1614375view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
 * NUTCH-1818 Add deps-test-compile task

svn commit: r1618521 - /nutch/cms_site/trunk/content/index.md

2014-08-17 Thread snagel
Author: snagel
Date: Sun Aug 17 20:24:29 2014
New Revision: 1618521

URL: http://svn.apache.org/r1618521
Log:
CMS commit to nutch by snagel

Modified:
nutch/cms_site/trunk/content/index.md

Modified: nutch/cms_site/trunk/content/index.md
URL: 
http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1618521r1=1618520r2=1618521view=diff
==
--- nutch/cms_site/trunk/content/index.md (original)
+++ nutch/cms_site/trunk/content/index.md Sun Aug 17 20:24:29 2014
@@ -120,6 +120,13 @@ under the License. 
   </div>
 
<div class="jumbotron">
+  <h2>31 July 2014 - Nutch tutorial at upcoming <a 
href="http://events.linuxfoundation.org/events/apachecon-europe">ApacheCon 
Europe</a> in Budapest</h2>
+  <p>The upcoming <a 
href="http://events.linuxfoundation.org/events/apachecon-europe">ApacheCon 
Europe</a> in Budapest, November 17 - 21, 2014,
+  will offer a one-day <a href="http://sched.co/1pbE15n">Nutch tutorial</a>. 
Topics will span from Nutch installation and configuration up to plugin
+  development. Both Nutch 1.x and 2.x are covered. The conference is a good 
opportunity to bring together both users and committers of Nutch and related 
projects.</p>
+  </div>
+
+   <div class="jumbotron">
   <h2>01 May 2014 - Apache Nutch Participates in <a 
href="https://www.google-melange.com/gsoc/homepage/google/gsoc2014">Google 
Summer of Code</a></h2>
   <a title="ApacheCon US 2009" href="http://www.us.apachecon.com/c/acus2009/">
 <img 
src="http://typo3.org/fileadmin/t3org/images/FM-news/2014/thisweek/920x156xbanner-gsoc2014.jpg"
 class="float-right" alt="GSoC Logo"/>




svn commit: r919651 - /websites/production/nutch/content/

2014-08-17 Thread snagel
Author: snagel
Date: Sun Aug 17 20:26:24 2014
New Revision: 919651

Log:
announce tutorial at ApacheCon Europe in Budapest

Added:
websites/production/nutch/content/
  - copied from r919650, websites/staging/nutch/trunk/content/



svn commit: r1619934 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/crawl/ trunk/ trunk/src/java/org/apache/nutch/crawl/

2014-08-22 Thread snagel
Author: snagel
Date: Fri Aug 22 21:23:32 2014
New Revision: 1619934

URL: http://svn.apache.org/r1619934
Log:
NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, 
generate.max.per.host.by.ip

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619934r1=1619933r2=1619934view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 21:23:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, 
generate.max.per.host.by.ip (Matthias Agethle via snagel)
+
 * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc)
 
 * NUTCH-1708 use same id when indexing and deleting redirects (snagel)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1619934r1=1619933r2=1619934view=diff
==
--- 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java 
Fri Aug 22 21:23:32 2014
@@ -60,12 +60,8 @@ implements FetchSchedule {
   public void setConf(Configuration conf) {
 super.setConf(conf);
 if (conf == null) return;
-int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
 defaultInterval = conf.getInt("db.fetch.interval.default", 0);
-if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = 
oldDefaultInterval * SECONDS_PER_DAY;
-int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
 maxInterval = conf.getInt("db.fetch.interval.max", 0 );
-if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * 
FetchSchedule.SECONDS_PER_DAY;
 LOG.info("defaultInterval=" + defaultInterval);
 LOG.info("maxInterval=" + maxInterval);
   }

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619934r1=1619933r2=1619934view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 22 21:23:32 2014
@@ -2,7 +2,7 @@ Nutch Change Log
 
 Nutch Current Development
 
-* NUTCH-XX
+* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, 
generate.max.per.host.by.ip (Matthias Agethle via snagel)
 
 Nutch 1.9 Release Change Log - 12/08/2014 (dd/mm/yyyy)
 Release Report - http://s.apache.org/1.9-release

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1619934r1=1619933r2=1619934view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Aug 22 
21:23:32 2014
@@ -50,9 +50,7 @@ public class CrawlDbReducer implements R
 retryMax = job.getInt("db.fetch.retry.max", 3);
 scfilters = new ScoringFilters(job);
 additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
-int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
 maxInterval = job.getInt("db.fetch.interval.max", 0 );
-if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * 
FetchSchedule.SECONDS_PER_DAY;
 schedule = FetchScheduleFactory.getFetchSchedule(job);
 int maxLinks = job.getInt("db.update.max.inlinks", 10000);
 linked = new InlinkPriorityQueue(maxLinks);

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1619934r1=1619933r2=1619934view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Aug 22 
21:23:32 2014
@@ -74,9 +74,6 @@ public class Generator extends Configure
   public static final String GENERATOR_DELAY = "crawl.gen.delay";
   public static final String GENERATOR_MAX_NUM_SEGMENTS = 
"generate.max.num.segments";
   
-  // deprecated parameters 
-  public static final String GENERATE_MAX_PER_HOST_BY_IP = 
"generate.max.per.host.by.ip";
-
   public static class SelectorEntry implements Writable

svn commit: r1619942 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/crawl/ branches/2.x/src/java/org/apache/nutch/parse/ trunk/ trunk/src/java/org/apache/nutch/crawl/

2014-08-22 Thread snagel
Author: snagel
Date: Fri Aug 22 22:23:27 2014
New Revision: 1619942

URL: http://svn.apache.org/r1619942
Log:
NUTCH-1693 TextMD5Signature computed on textual content

Added:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java   
(with props)
nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java   (with 
props)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/trunk/CHANGES.txt

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619942r1=1619941r2=1619942view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 22:23:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, 
markus via snagel)
+
 * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, 
generate.max.per.host.by.ip (Matthias Agethle via snagel)
 
 * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc)

Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942view=auto
==
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java 
(added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java 
Fri Aug 22 22:23:27 2014
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.storage.WebPage;
+
+/**
+ * Default implementation of a page signature. It calculates an MD5 hash of the
+ * textual content of a page. In case there is no text, it calculates a hash
+ * from the page's fetched content.
+ */
+public class TextMD5Signature extends Signature {
+
+  private final static Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
+
+  static {
+FIELDS.add(WebPage.Field.TEXT);
+  }
+
+  Signature fallback = new MD5Signature();
+
+  @Override
+  public byte[] calculate(WebPage page) {
+CharSequence text = page.getText();
+
+if (text == null || text.length() == 0) {
+  return fallback.calculate(page);
+}
+
+return MD5Hash.digest(text.toString()).getDigest();
+  }
+
+  @Override
+  public Collection<WebPage.Field> getFields() {
+Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
+fields.addAll(fallback.getFields());
+return fields;
+  }
+}

Propchange: 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
--
svn:eol-style = native

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1619942r1=1619941r2=1619942view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 
22 22:23:27 2014
@@ -187,8 +187,6 @@ public class ParseUtil extends Configure
   return;
 }
 
-final byte[] signature = sig.calculate(page);
-
 org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
 page.setParseStatus(pstatus);
 if (ParseStatusUtils.isSuccess(pstatus)) {
@@ -233,6 +231,7 @@ public class ParseUtil extends Configure
 if (prevSig != null) {
   page.setPrevSignature(prevSig);
 }
+final byte[] signature = sig.calculate(page);
 page.setSignature(ByteBuffer.wrap(signature));
 if (page.getOutlinks() != null) {
   page.getOutlinks().clear();

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev

svn commit: r1619944 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFilter.java

2014-08-22 Thread snagel
Author: snagel
Date: Fri Aug 22 22:28:12 2014
New Revision: 1619944

URL: http://svn.apache.org/r1619944
Log:
NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619944r1=1619943r2=1619944view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 22 22:28:12 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel)
+
 * NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, 
markus via snagel)
 
 * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, 
generate.max.per.host.by.ip (Matthias Agethle via snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=1619944r1=1619943r2=1619944view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Fri Aug 
22 22:28:12 2014
@@ -39,15 +39,22 @@ public interface IndexingFilter extends 
 
   /**
* Adds fields or otherwise modifies the document that will be indexed for a
-   * parse. Unwanted documents can be removed from indexing by returning a 
null value.
+   * parse. Unwanted documents can be removed from indexing by returning a null
+   * value.
* 
-   * @param doc document instance for collecting fields
-   * @param parse parse data instance
-   * @param url page url
-   * @param datum crawl datum for the page
-   * @param inlinks page inlinks
-   * @return modified (or a new) document instance, or null (meaning the 
document
-   * should be discarded)
+   * @param doc
+   *  document instance for collecting fields
+   * @param parse
+   *  parse data instance
+   * @param url
+   *  page url
+   * @param datum
+   *  crawl datum for the page (fetch datum from segment containing
+   *  fetch status and fetch time)
+   * @param inlinks
+   *  page inlinks
+   * @return modified (or a new) document instance, or null (meaning the
+   * document should be discarded)
* @throws IndexingException
*/
   NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum 
datum, Inlinks inlinks)




svn commit: r1625821 - in /nutch/cms_site/trunk/content/apidocs/apidocs-1.9: ./ org/ org/apache/ org/apache/nutch/ org/apache/nutch/analysis/ org/apache/nutch/analysis/lang/ org/apache/nutch/analysis/

2014-09-17 Thread snagel
Author: snagel
Date: Wed Sep 17 20:52:17 2014
New Revision: 1625821

URL: http://svn.apache.org/r1625821
Log:
add 1.9 Java apidocs


[This commit notification would consist of 137 parts, 
which exceeds the limit of 50 ones, so it was shortened to the summary.]


svn commit: r1625826 - /nutch/cms_site/trunk/content/javadoc.md

2014-09-17 Thread snagel
Author: snagel
Date: Wed Sep 17 21:07:29 2014
New Revision: 1625826

URL: http://svn.apache.org/r1625826
Log:
add apidoc 1.9

Modified:
nutch/cms_site/trunk/content/javadoc.md

Modified: nutch/cms_site/trunk/content/javadoc.md
URL: 
http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/javadoc.md?rev=1625826r1=1625825r2=1625826view=diff
==
--- nutch/cms_site/trunk/content/javadoc.md (original)
+++ nutch/cms_site/trunk/content/javadoc.md Wed Sep 17 21:07:29 2014
@@ -41,7 +41,7 @@ under the License. 
h2Current Releases Javadoc/h2
ul
 lia href=./apidocs/apidocs-2.2.1/index.html2.2.1 (2.X 
Branch)/a/li
-lia href=./apidocs/apidocs-1.8/index.html1.8 (1.X branch)/a/li
+lia href=./apidocs/apidocs-1.9/index.html1.9 (1.X branch)/a/li
/ul
   /div
  /section
@@ -55,6 +55,7 @@ under the License. 
  lia href=./apidocs/apidocs-2.2/index.html2.2/a/li
  lia href=./apidocs/apidocs-2.1/index.html2.1/a/li
  lia href=./apidocs/apidocs-2.0/index.html2.0/a/li
+ lia href=./apidocs/apidocs-1.8/index.html1.8/a/li
  lia href=./apidocs/apidocs-1.7/index.html1.7/a/li
  lia href=./apidocs/apidocs-1.6/index.html1.6/a/li
  lia href=./apidocs/apidocs-1.5/index.html1.5/a/li




svn commit: r922601 - /websites/production/nutch/content/

2014-09-17 Thread snagel
Author: snagel
Date: Wed Sep 17 21:08:05 2014
New Revision: 922601

Log:
add Java apidoc 1.9

Added:
websites/production/nutch/content/
  - copied from r922599, websites/staging/nutch/trunk/content/



svn commit: r922608 - /websites/production/nutch/content/

2014-09-17 Thread snagel
Author: snagel
Date: Wed Sep 17 21:32:43 2014
New Revision: 922608

Log:
update Java apidoc 1.9

Added:
websites/production/nutch/content/
  - copied from r922607, websites/staging/nutch/trunk/content/



svn commit: r1626581 - in /nutch: branches/2.x/KEYS branches/2.x/ivy/mvn.template trunk/KEYS trunk/ivy/mvn.template

2014-09-21 Thread snagel
Author: snagel
Date: Sun Sep 21 14:18:26 2014
New Revision: 1626581

URL: http://svn.apache.org/r1626581
Log:
add committer snagel

Modified:
nutch/branches/2.x/KEYS
nutch/branches/2.x/ivy/mvn.template
nutch/trunk/KEYS
nutch/trunk/ivy/mvn.template

Modified: nutch/branches/2.x/KEYS
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/KEYS?rev=1626581r1=1626580r2=1626581view=diff
==
--- nutch/branches/2.x/KEYS (original)
+++ nutch/branches/2.x/KEYS Sun Sep 21 14:18:26 2014
@@ -391,3 +391,39 @@ mig+aEkS1Y2uDm5fXNPICB/eKaZ02rVt1hzA5acj
 lTN6fp142rTJsWIbmePnY+jJhrGpAtdYx+2hFH0c5CPQ
 =cK1G
 -----END PGP PUBLIC KEY BLOCK-----
+pub   2048R/DB0A9C6D 2012-11-06
+uid  Sebastian Nagel sna...@apache.org
+sig 3DB0A9C6D 2012-11-06  Sebastian Nagel sna...@apache.org
+sub   2048R/9E631E2F 2012-11-06
+sig  DB0A9C6D 2012-11-06  Sebastian Nagel sna...@apache.org
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1
+
+mQENBFCY+9kBCADcgOPlGGBOPuldasyoEXoJwt8ACbRqZbeIQhS3YIIPFFRTJvns
+GlMAGyiSbMRkfi2uFz1g3u0uiMatD3CEHgV4wqu1d/ALmUKhGJ4VluEzjyRtRgUk
+OD0Xw52Rcp27GNdAwpEojWSy764PbGotNsRvqehYnu+iVBVpxRc//vfPjq/nt5xk
+BwTsR/o/EKulvFB6B1x/bySNZuJZksdpEOxA6s30Ig25nXA//9DrU3Vse40cz1LI
+wx2rCsqU15SxAabqXMdNeQD7gmlE+toPp+ziu3drX8U6iEYwC71RCnc8LtxXt/Aq
+cSnzlAmUmKNZpHM8AqKbW+IVH8iXN/LtKQLxABEBAAG0I1NlYmFzdGlhbiBOYWdl
+bCA8c25hZ2VsQGFwYWNoZS5vcmc+iQE4BBMBAgAiBQJQmPvZAhsDBgsJCAcDAgYV
+CAIJCgsEFgIDAQIeAQIXgAAKCRDGbqe32wqcbWpDCAC5r0Wy1vZ5luLR2IhqDH9a
+q7FLqpLL66LUBPX3mof9hTV3456uraozdmVyO39X3IvXlrQavt2ubreWxMBT+g2z
+hBiQHy5RkjCqYXXz5Gkkxv4rggjcHgwKZDN2gK0VP+7rUqg0JPW9hQPcEdlAqBi0
+i9knjoAwKEpypvo202KWF6JAeUiEOAXAdhcm98uzJN17GRY/N9+3ELbBy6zgbQpl
+6GYNNyS+vW+40aKThrOe7lvjBERN6v01yZ3QjQBfFkaefNjnXOaiR+JkQ5m99DBB
+vbGcQPGx5xpGGGjMbbfB4LT57/F0FxEzO+PgNph2vzEwM8+4BWMaEhxBGdvwuNW/
+uQENBFCY+9kBCAC+LI5lWBrOdCCbd62q4sIjqyJEMzFjozd73aDc3xA1dCrsIz02
++eg0LSmvt3DdPoDTMVLXs4GYM48U139Qy4o4T9gRNhFS+Im+OnI9CLKX29qy7hEu
+xxa1ByA5pZgJ+21wmCBjTVK9Jcf1JeDcAr2L3qVFY2+Hhvh3eOlNov9NzQJpmOv3
+9JRiia4Xm0h9AffL9P+AKlGRuCPfnsPu5JzEuo/wqQ+oIunYJFiCNbU/CaP1PK0x
+S0taB8fsYu/UBh1+bzc0xQvHWYmupqc62qGk8N+useZiKn+4BBvhTU6fykCrZTVH
+jGNi9qpPRy6bPpy9yknVRJDkrUGHiq8VqGA1ABEBAAGJAR8EGAECAAkFAlCY+9kC
+GwwACgkQxm6nt9sKnG1r2ggAxCjb8IoAjVddbEduTkWCFkqtpCjGFj/J1TetooqI
+SvKBeRDZyJ+kA10BIGg6DudYCApo7ObZN6EhxwhVDuPa7nxacrKVgJyhztmFDT8X
+zfhxQ8ytVWayHnvesmwolquIQtqRPfIvB/AwGZ9PjfJFMC6A229tTBhAgva4h8GE
+EEE6JEV4AIQRAcoisr4chzq/9xm19TEjYMvtE92QBiTYhu7uTfUQbnyP3uN4bLEm
+xY2l8d0700NQh27drc5An1wWeYZj+4HrFhnOXktODwi+8W3WNOGr71L1XPttW+G5
+ZnAbVvXpIOwmMCrU7YaCahFry/H+I7G+gWZ4mvujVMgoqw==
+=RlMy
+-----END PGP PUBLIC KEY BLOCK-----

Modified: nutch/branches/2.x/ivy/mvn.template
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/mvn.template?rev=1626581r1=1626580r2=1626581view=diff
==
--- nutch/branches/2.x/ivy/mvn.template (original)
+++ nutch/branches/2.x/ivy/mvn.template Sun Sep 21 14:18:26 2014
@@ -78,6 +78,11 @@
 nameTalat Uyarer/name
 emailta...@apache.org/email
 /developer
+   developer
+   idsnagel/id
+   nameSebastian Nagel/name
+   emailsna...@apache.org/email
+   /developer
/developers
 build
   sourceDirectorysrc/java/sourceDirectory

Modified: nutch/trunk/KEYS
URL: 
http://svn.apache.org/viewvc/nutch/trunk/KEYS?rev=1626581r1=1626580r2=1626581view=diff
==
--- nutch/trunk/KEYS (original)
+++ nutch/trunk/KEYS Sun Sep 21 14:18:26 2014
@@ -299,3 +299,39 @@ NPgXHx4ASqesjF/9GUrAQfOmXqHdOF6xOb7YYGss
 U3Wt+q9F6O+RmemV6a6mrpog+Aq+BkIMWCJ8
 =xHbT
 -----END PGP PUBLIC KEY BLOCK-----
+pub   2048R/DB0A9C6D 2012-11-06
+uid  Sebastian Nagel sna...@apache.org
+sig 3DB0A9C6D 2012-11-06  Sebastian Nagel sna...@apache.org
+sub   2048R/9E631E2F 2012-11-06
+sig  DB0A9C6D 2012-11-06  Sebastian Nagel sna...@apache.org
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1
+
+mQENBFCY+9kBCADcgOPlGGBOPuldasyoEXoJwt8ACbRqZbeIQhS3YIIPFFRTJvns
+GlMAGyiSbMRkfi2uFz1g3u0uiMatD3CEHgV4wqu1d/ALmUKhGJ4VluEzjyRtRgUk
+OD0Xw52Rcp27GNdAwpEojWSy764PbGotNsRvqehYnu+iVBVpxRc//vfPjq/nt5xk
+BwTsR/o/EKulvFB6B1x/bySNZuJZksdpEOxA6s30Ig25nXA//9DrU3Vse40cz1LI
+wx2rCsqU15SxAabqXMdNeQD7gmlE+toPp+ziu3drX8U6iEYwC71RCnc8LtxXt/Aq
+cSnzlAmUmKNZpHM8AqKbW+IVH8iXN/LtKQLxABEBAAG0I1NlYmFzdGlhbiBOYWdl
+bCA8c25hZ2VsQGFwYWNoZS5vcmc+iQE4BBMBAgAiBQJQmPvZAhsDBgsJCAcDAgYV
+CAIJCgsEFgIDAQIeAQIXgAAKCRDGbqe32wqcbWpDCAC5r0Wy1vZ5luLR2IhqDH9a
+q7FLqpLL66LUBPX3mof9hTV3456uraozdmVyO39X3IvXlrQavt2ubreWxMBT+g2z
+hBiQHy5RkjCqYXXz5Gkkxv4rggjcHgwKZDN2gK0VP+7rUqg0JPW9hQPcEdlAqBi0
+i9knjoAwKEpypvo202KWF6JAeUiEOAXAdhcm98uzJN17GRY/N9+3ELbBy6zgbQpl
+6GYNNyS+vW

svn commit: r1629076 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

2014-10-02 Thread snagel
Author: snagel
Date: Thu Oct  2 21:37:04 2014
New Revision: 1629076

URL: http://svn.apache.org/r1629076
Log:
NUTCH-1826 indexchecker fails if solr.server.url not configured

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1629076r1=1629075r2=1629076view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct  2 21:37:04 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured 
(lewismc, snagel)
+
 * NUTCH-1866 ant eclipse target should not delete runtime (nimafl vai lewismc)
 
 * NUTCH-1857 readb -dump -format csv should use comma (lewismc)

Modified: 
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1629076r1=1629075r2=1629076view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
Thu Oct  2 21:37:04 2014
@@ -95,8 +95,6 @@ public class IndexingFiltersChecker exte
 
 ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
 
-IndexWriters writers = new IndexWriters(getConf());
-
 if (!output.getStatus().isSuccess()) {
   System.out.println("Fetch failed with protocol status: " + 
output.getStatus());
   return 0;
@@ -166,9 +164,10 @@ public class IndexingFiltersChecker exte
 }
 
 if (conf.getBoolean("doIndex", false) && doc!=null){
-   writers.open(new JobConf(getConf()), "IndexingFilterChecker");
-   writers.write(doc);
-   writers.close();
+  IndexWriters writers = new IndexWriters(getConf());
+  writers.open(new JobConf(getConf()), "IndexingFilterChecker");
+  writers.write(doc);
+  writers.close();
 }
 
 return 0;




svn commit: r1630565 - in /nutch/trunk: ./ src/plugin/ src/plugin/protocol-http/ src/plugin/protocol-http/jsp/ src/plugin/protocol-http/src/test/conf/ src/plugin/protocol-http/src/test/org/apache/nutc

2014-10-09 Thread snagel
Author: snagel
Date: Thu Oct  9 19:20:51 2014
New Revision: 1630565

URL: http://svn.apache.org/r1630565
Log:
NUTCH-1164 JUnit tests for protocol-http

Added:
nutch/trunk/src/plugin/protocol-http/jsp/
nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp   (with props)
nutch/trunk/src/plugin/protocol-http/jsp/brokenpage.jsp   (with props)
nutch/trunk/src/plugin/protocol-http/jsp/redirect301.jsp   (with props)
nutch/trunk/src/plugin/protocol-http/jsp/redirect302.jsp   (with props)
nutch/trunk/src/plugin/protocol-http/src/test/conf/
nutch/trunk/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml   
(with props)

nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
   (with props)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/src/plugin/build.xml
nutch/trunk/src/plugin/protocol-http/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1630565r1=1630564r2=1630565view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct  9 19:20:51 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1164 Write JUnit tests for protocol-http (nimafl via snagel)
+
 * NUTCH-1868 Document and improve CLI for FileDumper tool (lewismc)
 
 * NUTCH-1869 Add a flag to -mimeType fiag to FileDumper (lewismc)
@@ -10,7 +12,7 @@ Nutch Current Development 1.10-SNAPSHOT
 
 * NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured 
(lewismc, snagel)
 
-* NUTCH-1866 ant eclipse target should not delete runtime (nimafl vai lewismc)
+* NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc)
 
 * NUTCH-1857 readb -dump -format csv should use comma (lewismc)
 

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1630565r1=1630564r2=1630565view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Oct  9 19:20:51 2014
@@ -992,7 +992,7 @@
 source path=${plugins.dir}/protocol-httpclient/src/java/ /
 source path=${plugins.dir}/protocol-httpclient/src/test/ /
 source path=${plugins.dir}/protocol-http/src/java/ /
-!-- source path=${plugins.dir}/protocol-http/src/test/ / -- 
+source path=${plugins.dir}/protocol-http/src/test/ /
 source path=${plugins.dir}/scoring-depth/src/java/ /
 source path=${plugins.dir}/scoring-link/src/java/ /
 source path=${plugins.dir}/scoring-opic/src/java/ /

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1630565r1=1630564r2=1630565view=diff
==
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Oct  9 19:20:51 2014
@@ -88,6 +88,7 @@
  ant dir=language-identifier target=test/
  ant dir=lib-http target=test/
  ant dir=protocol-file target=test/
+ ant dir=protocol-http target=test/
  ant dir=protocol-httpclient target=test/
  !--ant dir=parse-ext target=test/--
  ant dir=feed target=test/

Modified: nutch/trunk/src/plugin/protocol-http/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/build.xml?rev=1630565r1=1630564r2=1630565view=diff
==
--- nutch/trunk/src/plugin/protocol-http/build.xml (original)
+++ nutch/trunk/src/plugin/protocol-http/build.xml Thu Oct  9 19:20:51 2014
@@ -29,12 +29,22 @@
 fileset dir=${nutch.root}/build
   include name=**/lib-http/*.jar /
 /fileset
+pathelement location=${build.dir}/test/conf/
   /path
 
   !-- Deploy Unit test dependencies --
   target name=deps-test
 ant target=deploy inheritall=false dir=../lib-http/
 ant target=deploy inheritall=false dir=../nutch-extensionpoints/
+copy toDir=${build.test}
+  fileset dir=${src.test} excludes=**/*.java/
+/copy
   /target
 
+  !-- for junit test --
+  mkdir dir=${build.test}/data /
+  copy todir=${build.test}/data
+  fileset dir=jsp/
+   /copy
+
 /project

Added: nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp?rev=1630565view=auto
==
--- nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp (added)
+++ nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp Thu Oct  9 19:20:51 
2014
@@ -0,0 +1,44 @@
+%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional

svn commit: r1633222 - in /nutch/branches/2.x: ./ conf/ src/java/org/apache/nutch/parse/ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/ src/plugin/parse-html/src/java/org/apache

2014-10-20 Thread snagel
Author: snagel
Date: Mon Oct 20 20:44:00 2014
New Revision: 1633222

URL: http://svn.apache.org/r1633222
Log:
NUTCH-1827 Port issues 1467 and 1561 to 2.x

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java

nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java

nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java

nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java

nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java

nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1633222r1=1633221r2=1633222view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Oct 20 20:44:00 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel)
+
 * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche)
 
 * NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1633222r1=1633221r2=1633222view=diff
==
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Mon Oct 20 20:44:00 2014
@@ -1125,7 +1125,7 @@
   description
   Comma-separated list of keys to be taken from the metadata to generate 
fields.
   Can be used e.g. for 'description' or 'keywords' provided that these values 
are generated
-  by a parser (see parse-metatags plugin)  
+  by a parser (see parse-metatags plugin), and property 'metatags.names'.
   /description
 /property
 
@@ -1133,11 +1133,12 @@
 property
   namemetatags.names/name
   value*/value
-  description Names of the metatags to extract, separated by ';'. 
-  Use '*' to extract all metatags. Prefixes the names with 'meta_'
-  in the parse-metadata. For instance to index description and keywords, 
-  you need to activate the plugin index-metadata and set the value of the 
-  parameter 'index.metadata' to 'meta_description;meta_keywords'.
+  descriptionNames of the metatags to extract, separated by ','.
+  Use '*' to extract all metatags. Prefixes the names with 'meta_' in
+  the parse-metadata. For instance, to index description and keywords,
+  you need to activate the plugins parse-metadata and index-metadata
+  and set the value of the properties 'metatags.names' and
+  'index.metadata' to 'description,keywords'.
   /description
 /property
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1633222r1=1633221r2=1633222view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java Mon 
Oct 20 20:44:00 2014
@@ -21,6 +21,8 @@ import java.net.URL;
 import java.util.Iterator;
 import java.util.Properties;
 
+import org.apache.nutch.metadata.Metadata;
+
 /**
  * This class holds the information about HTML meta tags extracted from 
  * a page. Some special tags have convenience methods for easy checking.
@@ -40,7 +42,7 @@ public class HTMLMetaTags {
 
   private URL refreshHref = null;
 
-  private Properties generalTags = new Properties();
+  private Metadata generalTags = new Metadata();
 
   private Properties httpEquivTags = new Properties();
 
@@ -166,7 +168,7 @@ public class HTMLMetaTags {
* Returns all collected values of the general meta tags. Property names are
* tag names, property values are content values.
*/
-  public Properties getGeneralTags() {
+  public Metadata getGeneralTags() {
 return generalTags;
   }
 
@@ -188,13 +190,13 @@ public class HTMLMetaTags {
 + ", refreshHref=" + refreshHref + "\n"
 );
 sb.append(" * general tags:\n");
-Iterator<?> it = generalTags.keySet().iterator();
-while (it.hasNext()) {
-  String key = (String)it.next();
+String[] names = generalTags.names();
+for (String name : names) {
+  String key = name;
   sb.append("   - " + key + "\t=\t" + generalTags.get(key) + "\n");
 }
 sb.append(" * http-equiv tags:\n");
-it = httpEquivTags.keySet().iterator();
+Iterator<Object> it = httpEquivTags.keySet().iterator();
 while

svn commit: r1633426 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/build.xml trunk/CHANGES.txt trunk/build.xml

2014-10-21 Thread snagel
Author: snagel
Date: Tue Oct 21 17:52:27 2014
New Revision: 1633426

URL: http://svn.apache.org/r1633426
Log:
NUTCH-1882 ant eclipse target to add output path to src/test

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1633426r1=1633425r2=1633426view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Oct 21 17:52:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
+
 * NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel)
 
 * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche)

Modified: nutch/branches/2.x/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1633426r1=1633425r2=1633426view=diff
==
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Tue Oct 21 17:52:27 2014
@@ -954,7 +954,7 @@
 library pathref=eclipse.classpath exported=false /

 source path=${basedir}/src/java/ /
-source path=${basedir}/src/test/ /
+source path=${basedir}/src/test/ output=build/test/classes /
 source path=${basedir}/src/plugin/creativecommons/src/java/ /
 source path=${basedir}/src/plugin/creativecommons/src/test/ /
 !-- feed is currently disabled 

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1633426r1=1633425r2=1633426view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 21 17:52:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
+
 * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche)
 
 * NUTCH-1874 FileDumper comment typos ( Arthur Cinader via lewismc)

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1633426r1=1633425r2=1633426view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Oct 21 17:52:27 2014
@@ -947,7 +947,7 @@
 library pathref=eclipse.classpath exported=false /
 
 source path=${basedir}/src/java/ /
-source path=${basedir}/src/test/ /
+source path=${basedir}/src/test/ output=build/test/classes /
 
 source path=${plugins.dir}/creativecommons/src/java/ /
 source path=${plugins.dir}/creativecommons/src/test/ /




svn commit: r1634694 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/crawl trunk/CHANGES.txt trunk/src/bin/crawl

2014-10-27 Thread snagel
Author: snagel
Date: Mon Oct 27 21:38:50 2014
New Revision: 1634694

URL: http://svn.apache.org/r1634694
Log:
NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/crawl
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/crawl

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1634694&r1=1634693&r2=1634694&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Oct 27 21:38:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value 
(snagel)
+
 * NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
 
 * NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel)

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1634694&r1=1634693&r2=1634694&view=diff
==
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Mon Oct 27 21:38:50 2014
@@ -30,7 +30,7 @@ elif [ $# -eq 4 ]; then
  LIMIT=$4
 else
 echo Unknown # of arguments $#
-echo Usage: crawl seedDir crawlDir [solrUrl] numberOfRounds
+echo Usage: crawl seedDir crawlID [solrUrl] numberOfRounds
 exit -1;
 fi
 
@@ -40,7 +40,7 @@ if [ $SEEDDIR =  ]; then
 fi
 
 if [ $CRAWL_ID =  ]; then
-echo Missing crawlDir : crawl seedDir crawlID [solrURL] 
numberOfRounds
+echo Missing crawlID : crawl seedDir crawlID [solrURL] 
numberOfRounds
 exit -1;
 fi
 
@@ -98,16 +98,30 @@ if [ $mode = distributed ]; then
  fi
 fi
 
-# initial injection
-$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID
-RETCODE=$?
 
-if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-fi
+function __bin_nutch {
+# run $bin/nutch, exit if exit value indicates error
+
+echo $bin/nutch $@ ;# echo command and arguments
+$bin/nutch $@
+
+RETCODE=$?
+if [ $RETCODE -ne 0 ]
+then
+echo Error running:
+echo   $bin/nutch $@
+echo Failed with exit value $RETCODE.
+exit $RETCODE
+fi
+}
 
 
 
+# initial injection
+echo Injecting seed URLs
+__bin_nutch inject $SEEDDIR -crawlId $CRAWL_ID
+
+
 # main loop : rounds of generate - fetch - parse - update
 for ((a=1; a <= LIMIT ; a++))
 do
@@ -123,58 +137,28 @@ do
   batchId=`date +%s`-$RANDOM
 
   echo Generating a new fetchlist
-  $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-then exit $RETCODE 
-  fi
+  __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
 
   echo Fetching : 
-  $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId $CRAWL_ID -threads 50
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-then exit $RETCODE 
-  fi
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId $CRAWL_ID -threads 50
 
   # parsing the batch
   echo Parsing : 
   # enable the skipping of records for the parsing so that a dodgy document 
   # so that it does not fail the full task
   skipRecordsOptions=-D mapred.skip.attempts.to.start.skipping=2 -D 
mapred.skip.map.max.skip.records=1
-  $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId 
$CRAWL_ID
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-then exit $RETCODE 
-  fi
+  __bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId 
$CRAWL_ID
 
   # updatedb with this batch
   echo CrawlDB update for $CRAWL_ID
-  $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-then exit $RETCODE 
-  fi
+  __bin_nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID
 
   if [ -n $SOLRURL ]; then
 echo Indexing $CRAWL_ID on SOLR index - $SOLRURL
-$bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all 
-crawlId $CRAWL_ID
-RETCODE=$?
-
-if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-fi
+__bin_nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId 
$CRAWL_ID
 
 echo SOLR dedup - $SOLRURL
-$bin/nutch solrdedup $commonOptions $SOLRURL
-RETCODE=$?
-
-if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-fi
+__bin_nutch solrdedup $commonOptions $SOLRURL
   else
   echo Skipping indexing tasks: no SOLR url provided.
   fi

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1634694&r1=1634693&r2=1634694&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk

svn commit: r1638203 - in /nutch: branches/2.x/src/bin/crawl trunk/src/bin/crawl

2014-11-11 Thread snagel
Author: snagel
Date: Tue Nov 11 16:20:01 2014
New Revision: 1638203

URL: http://svn.apache.org/r1638203
Log:
NUTCH-1883 in case of generate: break loop and do not exit with error

Modified:
nutch/branches/2.x/src/bin/crawl
nutch/trunk/src/bin/crawl

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -137,7 +137,22 @@ do
   batchId=`date +%s`-$RANDOM
 
   echo Generating a new fetchlist
-  __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
+  generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId $CRAWL_ID -batchId $batchId)
+  echo $bin/nutch generate ${generate_args[@]}
+  $bin/nutch generate ${generate_args[@]}
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+  : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+echo Generate returned 1 (no new segments created)
+echo Escaping loop: no more URLs to fetch now
+break
+  else
+echo Error running:
+echo   $bin/nutch generate ${generate_args[@]}
+echo Failed with exit value $RETCODE.
+exit $RETCODE
+  fi
 
   echo Fetching : 
   __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId $CRAWL_ID -threads 50

Modified: nutch/trunk/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -133,7 +133,22 @@ do
   echo `date` : Iteration $a of $LIMIT
 
   echo Generating a new segment
-  __bin_nutch generate $commonOptions $CRAWL_PATH/crawldb 
$CRAWL_PATH/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+  generate_args=($commonOptions $CRAWL_PATH/crawldb $CRAWL_PATH/segments 
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+  echo $bin/nutch generate ${generate_args[@]}
+  $bin/nutch generate ${generate_args[@]}
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+  : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+echo Generate returned 1 (no new segments created)
+echo Escaping loop: no more URLs to fetch now
+break
+  else
+echo Error running:
+echo   $bin/nutch generate ${generate_args[@]}
+echo Failed with exit value $RETCODE.
+exit $RETCODE
+  fi
 
   # capture the name of the segment
   # call hadoop in distributed mode
@@ -168,7 +183,7 @@ do
   __bin_nutch invertlinks $CRAWL_PATH/linkdb $CRAWL_PATH/segments/$SEGMENT
 
   echo Dedup on crawldb
-  $bin/nutch dedup $CRAWL_PATH/crawldb
+  __bin_nutch dedup $CRAWL_PATH/crawldb
 
   if [ -n $SOLRURL ]; then
   echo Indexing $SEGMENT on SOLR index - $SOLRURL




svn commit: r1643412 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/conf/suffix-urlfilter.txt.template trunk/CHANGES.txt trunk/conf/suffix-urlfilter.txt.template

2014-12-05 Thread snagel
Author: snagel
Date: Fri Dec  5 19:53:35 2014
New Revision: 1643412

URL: http://svn.apache.org/r1643412
Log:
NUTCH-1877 Suffix URL filter to ignore query string by default

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/suffix-urlfilter.txt.template
nutch/trunk/CHANGES.txt
nutch/trunk/conf/suffix-urlfilter.txt.template

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1643412&r1=1643411&r2=1643412&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Dec  5 19:53:35 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via 
snagel)
+
 * NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
 
 * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério 
Pereira Araújo, Mengying Wang, snagel)

Modified: nutch/branches/2.x/conf/suffix-urlfilter.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/suffix-urlfilter.txt.template?rev=1643412&r1=1643411&r2=1643412&view=diff
==
--- nutch/branches/2.x/conf/suffix-urlfilter.txt.template (original)
+++ nutch/branches/2.x/conf/suffix-urlfilter.txt.template Fri Dec  5 19:53:35 
2014
@@ -16,8 +16,19 @@
 
 # case-insensitive, allow unknown suffixes
 +I
-# uncomment the line below to filter on url path
-#+P
+
+# filter on URL path only
++P
+# comment out to filter on complete URL
+# but be aware that the pattern
+#.com
+#  will then reject
+#http://xyz.com
+#http://xyz.com/search?q=foo.com
+#  while the pattern
+#.mp3
+#  will not apply to (URLs will pass)
+#http://xyz.com/music.mp3?q=abc
 
 ### prohibit these
 # pictures

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1643412&r1=1643411&r2=1643412&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Dec  5 19:53:35 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via 
snagel)
+
 * NUTCH-1890 Major Typo in Documentation for Integrating Nutch and Solr (Boadu 
Akoto Charles Jnr, mattmann)
 
 * NUTCH-1887 Specify HTMLMapper to use in TikaParser (jnioche)

Modified: nutch/trunk/conf/suffix-urlfilter.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/suffix-urlfilter.txt.template?rev=1643412&r1=1643411&r2=1643412&view=diff
==
--- nutch/trunk/conf/suffix-urlfilter.txt.template (original)
+++ nutch/trunk/conf/suffix-urlfilter.txt.template Fri Dec  5 19:53:35 2014
@@ -2,8 +2,19 @@
 
 # case-insensitive, allow unknown suffixes
 +I
-# uncomment the line below to filter on url path
-#+P
+
+# filter on URL path only
++P
+# comment out to filter on complete URL
+# but be aware that the pattern
+#.com
+#  will then reject
+#http://xyz.com
+#http://xyz.com/search?q=foo.com
+#  while the pattern
+#.mp3
+#  will not apply to (URLs will pass)
+#http://xyz.com/music.mp3?q=abc
 
 ### prohibit these
 # pictures




svn commit: r1655169 - in /nutch/branches/2.x: CHANGES.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml

2015-01-27 Thread snagel
Author: snagel
Date: Tue Jan 27 21:45:39 2015
New Revision: 1655169

URL: http://svn.apache.org/r1655169
Log:
NUTCH-1893 Parse-tika failes to parse feed files

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/parse-tika/ivy.xml
nutch/branches/2.x/src/plugin/parse-tika/plugin.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1655169&r1=1655168&r2=1655169&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jan 27 21:45:39 2015
@@ -2,7 +2,7 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
-NUTCH-XX
+* NUTCH-1893 Parse-tika failes to parse feed files (Mengying Wang via snagel)
 
 Nutch 2.3 Release 08012015 (ddmm)
 Release Report - http://s.apache.org/nutch_2.3

Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1655169&r1=1655168&r2=1655169&view=diff
==
--- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Tue Jan 27 21:45:39 2015
@@ -39,6 +39,7 @@
 dependency org=org.apache.tika name=tika-parsers rev=1.6 
conf=*-default
  exclude org=org.apache.tika name=tika-core /
 /dependency
+override module=rome rev=0.9/
   /dependencies
   
 /ivy-module

Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1655169&r1=1655168&r2=1655169&view=diff
==
--- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Tue Jan 27 21:45:39 2015
@@ -55,7 +55,7 @@
   library name=poi-ooxml-3.11-beta2.jar/
   library name=poi-ooxml-schemas-3.11-beta2.jar/
   library name=poi-scratchpad-3.11-beta2.jar/
-  library name=rome-1.0.jar/
+  library name=rome-0.9.jar/
   library name=slf4j-api-1.6.1.jar/
   library name=tagsoup-1.2.1.jar/
   library name=tika-parsers-1.6.jar/




svn commit: r1651193 - in /nutch/trunk: CHANGES.txt build.xml

2015-01-12 Thread snagel
Author: snagel
Date: Mon Jan 12 20:45:16 2015
New Revision: 1651193

URL: http://svn.apache.org/r1651193
Log:
NUTCH-1881 ant target resolve-default to keep test libs

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1651193&r1=1651192&r2=1651193&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jan 12 20:45:16 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1881 ant target resolve-default to keep test libs (snagel)
+
 * NUTCH-1660 Index filter for Page's latitude and longitude (Yasin Kılınç, 
lewismc)
 
 * NUTCH-1140 index-more plugin, resetTitle creates multiple values in title 
field (Joe Liedtke, kaveh minooie via snagel)

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1651193&r1=1651192&r2=1651193&view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Jan 12 20:45:16 2015
@@ -468,13 +468,13 @@
   !-- == --
 
   !-- target: resolve  = --
-  target name=resolve-default depends=clean-lib description=-- resolve 
and retrieve dependencies with ivy
+  target name=resolve-default depends=clean-default-lib description=-- 
resolve and retrieve dependencies with ivy
 ivy:resolve file=${ivy.file} conf=default log=download-only/
 ivy:retrieve pattern=${build.lib.dir}/[artifact]-[revision].[ext] 
symlink=false log=quiet/
 antcall target=copy-libs/
   /target
 
-  target name=resolve-test depends=clean-lib, init description=-- 
resolve and retrieve dependencies with ivy
+  target name=resolve-test depends=clean-test-lib, init description=-- 
resolve and retrieve dependencies with ivy
 ivy:resolve file=${ivy.file} conf=test log=download-only/
 ivy:retrieve pattern=${test.build.lib.dir}/[artifact]-[revision].[ext] 
symlink=false log=quiet/
 antcall target=copy-libs/
@@ -822,8 +822,15 @@
   /target
 
   !-- target: clean-lib  === --
-  target name=clean-lib description=-- clean the project libraries 
directory (dependencies)
-delete includeemptydirs=true dir=${build.lib.dir}/
+  target name=clean-lib depends=clean-default-lib, clean-test-lib
+  description=-- clean the project libraries directories 
(dependencies: default + test)
+  /target
+  !-- target: clean-default-lib  
=== --
+  target name=clean-default-lib description=-- clean the project 
libraries directory (dependencies)
+delete includeemptydirs=true dir=${build.lib.dir}/
+  /target
+  !-- target: clean-test-lib  === 
--
+  target name=clean-test-lib description=-- clean the project test 
libraries directory (dependencies)
 delete includeemptydirs=true dir=${test.build.lib.dir}/
   /target
 




svn commit: r1650181 - in /nutch/trunk: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/more/Te

2015-01-07 Thread snagel
Author: snagel
Date: Wed Jan  7 22:25:18 2015
New Revision: 1650181

URL: http://svn.apache.org/r1650181
Log:
NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field

Modified:
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650181&r1=1650180&r2=1650181&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan  7 22:25:18 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1140 index-more plugin, resetTitle creates multiple values in title 
field (Joe Liedtke, kaveh minooie via snagel)
+
 * NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann)
 
 * NUTCH-1897 Easier debugging of plugin XML errors (markus)

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Wed Jan  7 22:25:18 2015
@@ -289,7 +289,7 @@ public class MoreIndexingFilter implemen
 
   private NutchDocument resetTitle(NutchDocument doc, ParseData data, String 
url) {
 String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
-if (contentDisposition == null)
+if (contentDisposition == null || doc.getFieldValue(title) != null)
   return doc;
 
 for (int i=0; i<patterns.length; i++) {

Modified: 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==
--- 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 Wed Jan  7 22:25:18 2015
@@ -82,11 +82,21 @@ public class TestMoreIndexingFilter {
 MoreIndexingFilter filter = new MoreIndexingFilter();
 filter.setConf(conf);
 
-NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl(text, new ParseData(
-  new ParseStatus(), title, new Outlink[0], metadata)), new Text(
-http://www.example.com/;), new CrawlDatum(), new Inlinks());
+Text url = new Text(http://www.example.com/;);
+ParseImpl parseImpl = new ParseImpl(text, new ParseData(
+new ParseStatus(), title, new Outlink[0], metadata));
+
+NutchDocument doc = new NutchDocument();
+doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
 
 Assert.assertEquals(content-disposition not detected, filename.ext, 
doc.getFieldValue(title));
+
+/* NUTCH-1140: do not add second title to avoid a multi-valued title field 
*/
+doc = new NutchDocument();
+doc.add(title, title);
+doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+Assert.assertEquals(do not add second title by content-disposition,
+title, doc.getFieldValue(title));
   }
 
   private void assertParts(String[] parts, int count, String... expected) {




svn commit: r1670442 - /nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java

2015-03-31 Thread snagel
Author: snagel
Date: Tue Mar 31 19:28:14 2015
New Revision: 1670442

URL: http://svn.apache.org/r1670442
Log:
NUTCH-1979 CrawlDbReader to implement Tool: fix unit test

Modified:
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1670442&r1=1670441&r2=1670442&view=diff
==
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Tue Mar 
31 19:28:14 2015
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.After;
 import org.junit.Assert;
@@ -113,7 +114,7 @@ public class TestCrawlDbMerger {
   String url = it.next();
   LOG.fine(url= + url);
   CrawlDatum cd = expected.get(url);
-  CrawlDatum res = reader.get(crawlDb, url, conf);
+  CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf));
   LOG.fine( -  + res);
   System.out.println(url= + url);
   System.out.println( cd  + cd);




svn commit: r1669692 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ branches/2.x/src/plugin/protocol-httpclient/src/java/or

2015-03-27 Thread snagel
Author: snagel
Date: Fri Mar 27 21:42:35 2015
New Revision: 1669692

URL: http://svn.apache.org/r1669692
Log:
NUTCH-1941 Optional rolling http.agent.names

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml

nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml

nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Mar 27 21:42:35 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via 
snagel)
+
 * NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich 
via lewismc)
 
 * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff
==
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Mar 27 21:42:35 2015
@@ -162,6 +162,26 @@
 /property
 
 property
+  namehttp.agent.rotate/name
+  valuefalse/value
+  description
+If true, instead of http.agent.name, alternating agent names are
+chosen from a list provided via http.agent.rotate.file.
+  /description
+/property
+
+property
+  namehttp.agent.rotate.file/name
+  valueagents.txt/value
+  description
+File containing alternative user agent names to be used instead of
+http.agent.name on a rotating basis if http.agent.rotate is true.
+Each line of the file should contain exactly one agent
+specification including name, version, description, URL, etc.
+  /description
+/property
+
+property
   namehttp.agent.host/name
   value/value
   descriptionName or IP address of the host on which the Nutch crawler

Modified: 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==
--- 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Fri Mar 27 21:42:35 2015
@@ -17,16 +17,22 @@
 package org.apache.nutch.protocol.http.api;
 
 // JDK imports
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.Reader;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -53,6 +59,8 @@ public abstract class HttpBase implement
 
   private HttpRobotRulesParser robots = null;
 
+  private ArrayListString userAgentNames = null;
+
   /** The proxy hostname. */
   protected String proxyHost = null;
 
@@ -132,6 +140,45 @@ public abstract class HttpBase implement
 this.responseTime = conf.getBoolean(http.store.responsetime, true);
 this.robots.setConf(conf);
 
+// NUTCH-1941: read list of alternating agent names
+if (conf.getBoolean(http.agent.rotate, false)) {
+  String agentsFile = conf.get(http.agent.rotate.file, agents.txt);
+  BufferedReader br = null;
+  try {
+Reader reader = conf.getConfResourceAsReader(agentsFile);
+br = new BufferedReader(reader);
+userAgentNames = new ArrayListString();
+String word = ;
+while ((word = br.readLine()) != null) {
+  if (!word.trim().isEmpty

svn commit: r1678824 - in /nutch/trunk: CHANGES.txt src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java

2015-05-11 Thread snagel
Author: snagel
Date: Mon May 11 21:04:59 2015
New Revision: 1678824

URL: http://svn.apache.org/r1678824
Log:
NUTCH-1998 Add support for user-defined file extension to 
CommonCrawlDataDumper: fix unit test

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678824&r1=1678823&r2=1678824&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 11 21:04:59 2015
@@ -2,7 +2,7 @@ Nutch Change Log
  
 Nutch Current Development 1.11-SNAPSHOT
 
-* NUTCH-1988 Add support for user-defined file extension to 
CommonCrawlDataDumper (totaro via mattmann)
+* NUTCH-1998 Add support for user-defined file extension to 
CommonCrawlDataDumper (totaro via mattmann)
 
 * NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel 
via lewismc)
  
@@ -54,8 +54,6 @@ Release Report: http://s.apache.org/nutc
 
 * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro 
via mattmann)
 
-* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via 
mattmann)
-
 * NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of 
RobotRules parsing (mattmann, snagel)
 
 * NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via 
mattmann)

Modified: 
nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java?rev=1678824&r1=1678823&r2=1678824&view=diff
==
--- nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java 
Mon May 11 21:04:59 2015
@@ -101,20 +101,16 @@ public class TestCommonCrawlDataDumper {
 
CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(
new CommonCrawlConfig());
-   dumper.dump(tempDir, sampleSegmentDir, false, null, false);
+   dumper.dump(tempDir, sampleSegmentDir, false, null, false, );
 
CollectionFile tempFiles = FileUtils.listFiles(tempDir,
FileFilterUtils.fileFileFilter(),
FileFilterUtils.directoryFileFilter());
 
-   boolean hasAll = true;
for (String expectedFileName : crawledFiles) {
-   if (!hasFile(expectedFileName, tempFiles)) {
-   hasAll = false;
-   break;
-   }
+ assertTrue(Missed file  + expectedFileName +  in dump, 
+ hasFile(expectedFileName, tempFiles));
}
-   assertTrue(hasAll);
 
}
 




svn commit: r1680110 - in /nutch/trunk: CHANGES.txt conf/log4j.properties

2015-05-18 Thread snagel
Author: snagel
Date: Mon May 18 21:39:23 2015
New Revision: 1680110

URL: http://svn.apache.org/r1680110
Log:
NUTCH-2013 Fetcher: missing logs fetching ... on stdout

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1680110&r1=1680109&r2=1680110&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 18 21:39:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel)
+
 * NUTCH-2014 Fetcher hang-up on completion (snagel)
 
 * NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen 
Shah via mattmann)

Modified: nutch/trunk/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1680110&r1=1680109&r2=1680110&view=diff
==
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Mon May 18 21:39:23 2015
@@ -28,6 +28,11 @@ log4j.logger.org.apache.nutch.crawl.Craw
 log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.Generator=INFO,cmdstdout
 log4j.logger.org.apache.nutch.fetcher.Fetcher=INFO,cmdstdout
+log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout
+log4j.logger.org.apache.nutch.fetcher.FetcherItem=INFO,cmdstdout
+log4j.logger.org.apache.nutch.fetcher.FetcherItemQueue=INFO,cmdstdout
+log4j.logger.org.apache.nutch.fetcher.FetcherItemQueues=INFO,cmdstdout
+log4j.logger.org.apache.nutch.fetcher.QueueFeeder=INFO,cmdstdout
 log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout




svn commit: r1680109 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

2015-05-18 Thread snagel
Author: snagel
Date: Mon May 18 21:35:03 2015
New Revision: 1680109

URL: http://svn.apache.org/r1680109
Log:
NUTCH-2014 Fetcher hang-up on completion

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1680109&r1=1680108&r2=1680109&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 18 21:35:03 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2014 Fetcher hang-up on completion (snagel)
+
 * NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen 
Shah via mattmann)
 
 * NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1680109&r1=1680108&r2=1680109&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon May 18 
21:35:03 2015
@@ -214,7 +214,7 @@ public class Fetcher extends NutchTool i
 
 for (int i = 0; i < threadCount; i++) { // spawn threads
   FetcherThread t = new FetcherThread(getConf(), getActiveThreads(), 
fetchQueues, 
-  feeder, spinWaiting, lastRequestStart, reporter, activeThreads, 
segmentName,
+  feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName,
   parsing, output, storingContent, pages, bytes);
   fetcherThreads.add(t);
   t.start();




svn commit: r1674399 - in /nutch/trunk: ./ conf/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-ftp/src/java/org/apache/nutch/

2015-04-17 Thread snagel
Author: snagel
Date: Fri Apr 17 20:49:19 2015
New Revision: 1674399

URL: http://svn.apache.org/r1674399
Log:
NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules 
parsing

Removed:
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java

nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java

nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674399&r1=1674398&r2=1674399&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 17 20:49:19 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of 
RobotRules parsing (mattmann, snagel)
+
 * NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via 
mattmann)
 
 * NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via 
mattmann)

Modified: nutch/trunk/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1674399&r1=1674398&r2=1674399&view=diff
==
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Fri Apr 17 20:49:19 2015
@@ -54,6 +54,7 @@ log4j.logger.org.apache.nutch.indexer.In
 log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout
 log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout
 log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout
+log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout
 log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN
 
 log4j.logger.org.apache.nutch=INFO

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1674399&r1=1674398&r2=1674399&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Apr 17 20:49:19 2015
@@ -118,6 +118,15 @@
 </property>
 
 <property>
+  <name>http.robot.rules.whitelist</name>
+  <value></value>
+  <description>Comma separated list of hostnames or IP addresses to ignore 
+  robot rules parsing for. Use with care and only if you are explicitly
+  allowed by the site owner to ignore the site's robots.txt!
+  </description>
+</property>
+
+<property>
   <name>http.robots.403.allow</name>
   <value>true</value>
   <description>Some servers return HTTP status 403 (Forbidden) if

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1674399&r1=1674398&r2=1674399&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri 
Apr 17 20:49:19 2015
@@ -20,10 +20,15 @@ package org.apache.nutch.protocol;
 // JDK imports
 import java.io.File;
 import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.LineNumberReader;
+import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Set;
 import java.util.StringTokenizer;
 
 // Commons Logging imports
@@ -32,10 +37,11 @@ import org.slf4j.LoggerFactory;
 
 // Nutch imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
-
-import com.google.common.io.Files;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
 
 import crawlercommons.robots.BaseRobotRules;
 import crawlercommons.robots.SimpleRobotRules;
@@ -46,8 +52,11 @@ import crawlercommons.robots.SimpleRobot
  * This class uses crawler-commons for handling the parsing of
  * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
  * the download permissions as described in SimpleRobotRulesParser.
+ * 
+ * Protocol-specific implementations have to implement the method
+ * {@link getRobotRulesSet}.
  */
-public abstract class RobotRulesParser implements Configurable {
+public abstract class RobotRulesParser implements Tool {
 
   public static final Logger LOG = LoggerFactory
   .getLogger

svn commit: r1674581 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/segment/SegmentChecker.java

2015-04-18 Thread snagel
Author: snagel
Date: Sat Apr 18 20:41:13 2015
New Revision: 1674581

URL: http://svn.apache.org/r1674581
Log:
NUTCH-1854 bin/crawl fails with a parsing fetcher

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674581&r1=1674580&r2=1674581&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 20:41:13 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
+
 * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro 
via mattmann)
 
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via 
mattmann)
@@ -12,7 +14,7 @@ Nutch Current Development 1.10-SNAPSHOT
 
 * NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via 
mattmann)
 
-* NUTCH-1911 Imeprove DomainStatistics tool command line parsing (Michael 
Joyce via mattmann)
+* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce 
via mattmann)
 
 * NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
 

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sat Apr 18 
20:41:13 2015
@@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
@@ -32,6 +33,7 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.util.*;
 import org.apache.hadoop.fs.Path;
 
@@ -198,6 +200,11 @@ public class ParseSegment extends Config
   }
 
   public void parse(Path segment) throws IOException {
+   if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+ LOG.warn("Segment: " + segment
+ + " already parsed!! Skipped parsing this 
segment!!"); // NUTCH-1854
+ return;
+   }
 
 SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
 long start = System.currentTimeMillis();

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sat Apr 
18 20:41:13 2015
@@ -115,4 +115,16 @@ public class SegmentChecker {
 }
   }
 
-}
\ No newline at end of file
+  /**
+   * Check the segment to see if it is has been parsed before.
+   */
+  public static boolean isParsed(Path segment, FileSystem fs)
+ throws IOException {
+
+   if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME)))
+ return true;
+   return false;
+
+  } 
+
+}




svn commit: r1672939 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/ivy/ivy.xml trunk/CHANGES.txt trunk/ivy/ivy.xml

2015-04-11 Thread snagel
Author: snagel
Date: Sat Apr 11 22:07:52 2015
New Revision: 1672939

URL: http://svn.apache.org/r1672939
Log:
NUTCH-1981 Upgrade to icu4j 55.1

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/ivy/ivy.xml
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1672939&r1=1672938&r2=1672939&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Apr 11 22:07:52 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
+
 * NUTCH-1944 Index HTML raw content (meabed via mattmann)
 
 * NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via 
snagel)

Modified: nutch/branches/2.x/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1672939&r1=1672938&r2=1672939&view=diff
==
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Sat Apr 11 22:07:52 2015
@@ -54,7 +54,7 @@
   <exclude org="org.mortbay.jetty" name="jsp-*" />
 </dependency>
 
-<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
+<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
 <dependency org="org.apache.tika" name="tika-core" rev="1.7" />
 <dependency org="com.googlecode.juniversalchardet" 
name="juniversalchardet" rev="1.0.3"/>
 

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1672939&r1=1672938&r2=1672939&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 11 22:07:52 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
+
 * NUTCH-1960 JUnit test for dump method of CommonCrawlDataDumper (Giuseppe 
Totaro via mattmann)
 
 * NUTCH-1983 CommonCrawlDumper and FileDumper don't dump correct JSON 
(mattmann)

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1672939&r1=1672938&r2=1672939&view=diff
==
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Apr 11 22:07:52 2015
@@ -62,7 +62,7 @@
</dependency>
 
<dependency org="org.apache.tika" name="tika-core" rev="1.7" />
-   <dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
+   <dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
 
<dependency org="xerces" name="xercesImpl" rev="2.9.1" />
<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />




svn commit: r1687604 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDb.java

2015-06-25 Thread snagel
Author: snagel
Date: Thu Jun 25 18:41:26 2015
New Revision: 1687604

URL: http://svn.apache.org/r1687604
Log:
NUTCH-2000 Link inversion fails with .locked already exists

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1687604&r1=1687603&r2=1687604&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 25 18:41:26 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2000 Link inversion fails with .locked already exists (jnioche, snagel)
+
 * NUTCH-2036 Adding some continuous crawl goodies to the crawl script (jorge, 
snagel)
 
 * NUTCH-2039 Relevance based scoring filter (Sujen Shah, lewismc via mattmann)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1687604&r1=1687603&r2=1687604&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jun 25 18:41:26 
2015
@@ -196,6 +196,7 @@ public class LinkDb extends NutchTool im
  && job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
   LOG.warn("LinkDb: internal and external links are ignored! "
   + "Nothing to do, actually. Exiting.");
+  LockUtil.removeLockFile(fs, lock);
   return;
 }
 




svn commit: r1682103 - in /nutch/trunk: CHANGES.txt src/bin/nutch

2015-05-27 Thread snagel
Author: snagel
Date: Wed May 27 19:31:51 2015
New Revision: 1682103

URL: http://svn.apache.org/r1682103
Log:
NUTCH-2007 add test libs to classpath of bin/nutch junit

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682103&r1=1682102&r2=1682103&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 19:31:51 2015
@@ -2,7 +2,9 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
-* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
+* NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
+
+* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)
 
 * NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel)
 

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1682103&r1=1682102&r2=1682103&view=diff
==
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed May 27 19:31:51 2015
@@ -270,6 +270,11 @@ elif [ $COMMAND = plugin ] ; then
   CLASS=org.apache.nutch.plugin.PluginRepository
 elif [ $COMMAND = junit ] ; then
   CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/
+  if $local; then
+for f in $NUTCH_HOME/test/lib/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+  fi
   CLASS=org.junit.runner.JUnitCore
 elif [ $COMMAND = startserver ] ; then
   CLASS=org.apache.nutch.service.NutchServer




svn commit: r1691436 - /nutch/trunk/CHANGES.txt

2015-07-16 Thread snagel
Author: snagel
Date: Thu Jul 16 19:52:00 2015
New Revision: 1691436

URL: http://svn.apache.org/r1691436
Log:
remove duplicate entries

Modified:
nutch/trunk/CHANGES.txt

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1691436&r1=1691435&r2=1691436&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 16 19:52:00 2015
@@ -100,16 +100,6 @@ Release Report: http://s.apache.org/nutc
 
 * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
 
-* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro 
via mattmann)
-
-* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of 
RobotRules parsing (mattmann, snagel)
-
-* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via 
mattmann)
-
-* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via 
mattmann)
-
-* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce 
via mattmann)
-
 * NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
 
 * NUTCH-1960 JUnit test for dump method of CommonCrawlDataDumper (Giuseppe 
Totaro via mattmann)




svn commit: r1714655 - in /nutch/branches/2.x: CHANGES.txt conf/schema.xml

2015-11-16 Thread snagel
Author: snagel
Date: Mon Nov 16 20:29:33 2015
New Revision: 1714655

URL: http://svn.apache.org/viewvc?rev=1714655&view=rev
Log:
NUTCH-2130 copyField rawcontent creates error within schema.xml

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1714655&r1=1714654&r2=1714655&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Nov 16 20:29:33 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban 
Drulea, lewismc, snagel)
+
 * NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the 
Release Management Documentation (lewismc)
 
 * NUTCH-2105 Update Nutch Cassandra Dockerfile to work with Gora Nutch 2.3.1 
(lewismc)

Modified: nutch/branches/2.x/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1714655&r1=1714654&r2=1714655&view=diff
==
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Mon Nov 16 20:29:33 2015
@@ -32,6 +32,7 @@
 
 
 
+
 
 
 
+
+
+
+
  
  id
  text
@@ -367,7 +374,6 @@
 or to add multiple fields to the same field for easier/faster 
searching.  -->
 
  
- 
  
  
  




svn commit: r1707360 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherThread.java

2015-10-07 Thread snagel
Author: snagel
Date: Wed Oct  7 19:02:42 2015
New Revision: 1707360

URL: http://svn.apache.org/viewvc?rev=1707360&view=rev
Log:
NUTCH-2124 Fetcher following same redirect again and again

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1707360&r1=1707359&r2=1707360&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Oct  7 19:02:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log

 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2124 Fetcher following same redirect again and again (Yogendra Kumar 
Soni via snagel)
+
 * NUTCH-2123 Seed List REST API returns Text but headers indicate/require JSON
   (Aron Ahmadia, Sujen Shah via mattmann)
 

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1707360&r1=1707359&r2=1707360&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Wed Oct  7 
19:02:42 2015
@@ -325,7 +325,7 @@ public class FetcherThread extends Threa
 newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME,
 Fetcher.CONTENT_REDIR);
 if (redirUrl != null) {
-  queueRedirect(redirUrl, fit);
+  fit = queueRedirect(redirUrl, fit);
 }
   }
   break;
@@ -346,7 +346,7 @@ public class FetcherThread extends Threa
   Text redirUrl = handleRedirect(fit.url, fit.datum, urlString,
   newUrl, temp, Fetcher.PROTOCOL_REDIR);
   if (redirUrl != null) {
-queueRedirect(redirUrl, fit);
+fit = queueRedirect(redirUrl, fit);
   } else {
 // stop redirecting
 redirecting = false;
@@ -485,7 +485,7 @@ public class FetcherThread extends Threa
 }
   }
 
-  private void queueRedirect(Text redirUrl, FetchItem fit)
+  private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
   throws ScoringFilterException {
 CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
 fit.datum.getFetchInterval(), fit.datum.getScore());
@@ -506,6 +506,7 @@ public class FetcherThread extends Threa
   reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect",
   1);
 }
+return fit;
   }
 
   private void logError(Text url, String message) {




svn commit: r1704425 - in /nutch/trunk: ./ src/plugin/lib-selenium/ src/plugin/protocol-interactiveselenium/ src/plugin/protocol-selenium/

2015-09-21 Thread snagel
Author: snagel
Date: Mon Sep 21 21:14:55 2015
New Revision: 1704425

URL: http://svn.apache.org/viewvc?rev=1704425&view=rev
Log:
NUTCH-2106 Runtime to contain Selenium and dependencies only once

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-selenium/build-ivy.xml
nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt
nutch/trunk/src/plugin/lib-selenium/ivy.xml
nutch/trunk/src/plugin/lib-selenium/plugin.xml
nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml
nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml
nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml
nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
nutch/trunk/src/plugin/protocol-selenium/ivy.xml
nutch/trunk/src/plugin/protocol-selenium/plugin.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1704425&r1=1704424&r2=1704425&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Sep 21 21:14:55 2015
@@ -2,6 +2,8 @@ Nutch Change Log

 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2106 Runtime to contain Selenium and dependencies only once (snagel)
+
 * NUTCH-2104 Add documentation to the protocol-selenium plugin Readme file 
   re: selenium grid implementation (Kim Whitehall via mattmann)
 

Modified: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build-ivy.xml?rev=1704425&r1=1704424&r2=1704425&view=diff
==
--- nutch/trunk/src/plugin/lib-selenium/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/build-ivy.xml Mon Sep 21 21:14:55 2015
@@ -48,7 +48,7 @@
 
 
   
-
+
   
 
 

Modified: nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt?rev=1704425&r1=1704424&r2=1704425&view=diff
==
--- nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt (original)
+++ nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt Mon Sep 21 
21:14:55 2015
@@ -1,6 +1,9 @@
 1. Upgrade various driver versions dependency in 
src/plugin/lib-selenium/ivy.xml
 
-2. Upgrade Tika's own dependencies in src/plugin/lib-selenium/plugin.xml
-   To get the list of dependencies and their versions execute:
-   $ ant -f ./build-ivy.xml
-   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
+
+   To get a list of dependencies and their versions execute:
+$ ant -f ./build-ivy.xml
+$ ls lib | sed 's/^/     <library name="/g' | sed 's/$/">\n       <export name="*"\/>\n     <\/library>/g'
+
+   Note that all dependent libraries are exported for a "library" plugin 
("lib-selenium").

Modified: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1704425&r1=1704424&r2=1704425&view=diff
==
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Mon Sep 21 21:14:55 2015
@@ -27,7 +27,7 @@
   
 
   
-
+
   
 
   

Modified: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1704425&r1=1704424&r2=1704425&view=diff
==
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Mon Sep 21 21:14

svn commit: r1718678 - in /nutch/trunk: conf/nutch-default.xml default.properties src/bin/nutch

2015-12-08 Thread snagel
Author: snagel
Date: Tue Dec  8 19:18:19 2015
New Revision: 1718678

URL: http://svn.apache.org/viewvc?rev=1718678&view=rev
Log:
Update Nutch trunk for new development: 1.11 -> 1.12

Modified:
nutch/trunk/conf/nutch-default.xml
nutch/trunk/default.properties
nutch/trunk/src/bin/nutch

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1718678&r1=1718677&r2=1718678&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Dec  8 19:18:19 2015
@@ -164,7 +164,7 @@
 
 
   <name>http.agent.version</name>
-  <value>Nutch-1.11-SNAPSHOT</value>
+  <value>Nutch-1.12-SNAPSHOT</value>
   <description>A version string to advertise in the User-Agent 
header.</description>
 

Modified: nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1718678&r1=1718677&r2=1718678&view=diff
==
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Tue Dec  8 19:18:19 2015
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 name=apache-nutch
-version=1.11-SNAPSHOT
+version=1.12-SNAPSHOT
 final.name=${name}-${version}
 year=2015
 

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1718678&r1=1718677&r2=1718678&view=diff
==
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Tue Dec  8 19:18:19 2015
@@ -53,7 +53,7 @@ done
 
 # if no args specified, show usage
 if [ $# = 0 ]; then
-  echo "nutch 1.11"
+  echo "nutch 1.12"
   echo "Usage: nutch COMMAND"
   echo "where COMMAND is one of:"
   echo "  readdb            read / dump crawl db"




svn commit: r1717537 - in /nutch/branches/2.x: CHANGES.txt src/plugin/subcollection/plugin.xml src/plugin/urlnormalizer-regex/plugin.xml

2015-12-01 Thread snagel
Author: snagel
Date: Tue Dec  1 21:17:14 2015
New Revision: 1717537

URL: http://svn.apache.org/viewvc?rev=1717537&view=rev
Log:
NUTCH-2107 plugin.xml to validate against plugin.dtd

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/subcollection/plugin.xml
nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1717537&r1=1717536&r2=1717537&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Dec  1 21:17:14 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
+
 * NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban 
Drulea, lewismc, snagel)
 
 * NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the 
Release Management Documentation (lewismc)

Modified: nutch/branches/2.x/src/plugin/subcollection/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/plugin.xml?rev=1717537&r1=1717536&r2=1717537&view=diff
==
--- nutch/branches/2.x/src/plugin/subcollection/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/subcollection/plugin.xml Tue Dec  1 21:17:14 
2015
@@ -21,16 +21,16 @@
version="1.0.0"
provider-name="apache.org">
 
-   
-  
-   
-

   
  
   


+   
+  
+   
+


Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml?rev=1717537&r1=1717536&r2=1717537&view=diff
==
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml Tue Dec  1 
21:17:14 2015
@@ -28,7 +28,7 @@

 

-
+  

 


svn commit: r1717536 - in /nutch/trunk: CHANGES.txt src/plugin/subcollection/plugin.xml src/plugin/urlnormalizer-regex/plugin.xml

2015-12-01 Thread snagel
Author: snagel
Date: Tue Dec  1 21:15:21 2015
New Revision: 1717536

URL: http://svn.apache.org/viewvc?rev=1717536&view=rev
Log:
NUTCH-2107 plugin.xml to validate against plugin.dtd

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/subcollection/plugin.xml
nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1717536&r1=1717535&r2=1717536&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec  1 21:15:21 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
+
 * NUTCH-2177 Generator produces only one partition even in distributed mode 
(jnioche, snagel)
 
 * NUTCH-2158 Upgrade to Tika 1.11 (jnioche, snagel)

Modified: nutch/trunk/src/plugin/subcollection/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=1717536&r1=1717535&r2=1717536&view=diff
==
--- nutch/trunk/src/plugin/subcollection/plugin.xml (original)
+++ nutch/trunk/src/plugin/subcollection/plugin.xml Tue Dec  1 21:15:21 2015
@@ -21,16 +21,16 @@
version="1.0.0"
provider-name="apache.org">
 
-   
-  
-   
-

   
  
   


+   
+  
+   
+


Modified: nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml?rev=1717536&r1=1717535&r2=1717536&view=diff
==
--- nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml Tue Dec  1 21:15:21 
2015
@@ -28,7 +28,7 @@

 

-
+  

 


svn commit: r1718223 - in /nutch/trunk: CHANGES.txt conf/contenttype-mapping.txt.template src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

2015-12-06 Thread snagel
Author: snagel
Date: Sun Dec  6 21:14:06 2015
New Revision: 1718223

URL: http://svn.apache.org/viewvc?rev=1718223&view=rev
Log:
NUTCH-2172 index-more: document format of contenttype-mapping.txt

Added:
nutch/trunk/conf/contenttype-mapping.txt.template
Modified:
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718223&r1=1718222&r2=1718223&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Dec  6 21:14:06 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
-
+
+* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola 
Tonellotto, snagel)
+
 Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 

Added: nutch/trunk/conf/contenttype-mapping.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/contenttype-mapping.txt.template?rev=1718223&view=auto
==
--- nutch/trunk/conf/contenttype-mapping.txt.template (added)
+++ nutch/trunk/conf/contenttype-mapping.txt.template Sun Dec  6 21:14:06 2015
@@ -0,0 +1,22 @@
+#
+# Mapping of detected content types (MIME types) to custom types (target types)
+# used by the plugin index-more when filling the index field `type'.
+#
+# Note: The mappings defined in this file are only active if the property
+# `moreIndexingFilter.mapMimeTypes' is true.
+#
+# Format (tab-separated plain text, comment lines start with `#'):
+#
+# <target type> <TAB> <type1> [<TAB> <type2> ...]
+#
+# Examples (comment in to activate):
+#
+# map XHTML to HTML
+#text/html application/xhtml+xml
+#
+# Map XHTML and HTML to a custom type "web page"
+#web page  text/html   application/xhtml+xml
+#
+# map various office document formats to a custom type "office document"
+#office document   application/vnd.oasis.opendocument.text 
application/x-tika-msoffice application/msword
+#

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1718223&r1=1718222&r2=1718223&view=diff
==
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Sun Dec  6 21:14:06 2015
@@ -312,10 +312,12 @@ public class MoreIndexingFilter implemen
   }
 
   private void readConfiguration() throws IOException {
+LOG.info("Reading content type mappings from file 
contenttype-mapping.txt");
 BufferedReader reader = new BufferedReader(
 conf.getConfResourceAsReader("contenttype-mapping.txt"));
 String line;
 String parts[];
+boolean formatWarningShown = false;
 
 mimeMap = new HashMap<String, String>();
 
@@ -329,6 +331,12 @@ public class MoreIndexingFilter implemen
   for (int i = 1; i < parts.length; i++) {
 mimeMap.put(parts[i].trim(), parts[0].trim());
   }
+} else {
+  LOG.warn("Wrong format of line: {}", line);
+  if (!formatWarningShown) {
+LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
+formatWarningShown = true;
+  }
 }
   }
 }




svn commit: r1718718 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java trunk/CHANGES.txt trunk/src/plugin/parse-html/src/jav

2015-12-08 Thread snagel
Author: snagel
Date: Tue Dec  8 21:45:47 2015
New Revision: 1718718

URL: http://svn.apache.org/viewvc?rev=1718718&view=rev
Log:
NUTCH-2042 parse-html increase chunk size used to detect charset

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/trunk/CHANGES.txt

nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Dec  8 21:45:47 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
 * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
 
 * NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban 
Drulea, lewismc, snagel)

Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==
--- 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Tue Dec  8 21:45:47 2015
@@ -27,6 +27,7 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -67,7 +68,8 @@ public class HtmlParser implements Parse
   // I used 1000 bytes at first, but found that some documents have
   // meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
-  private static final int CHUNK_SIZE = 2000;
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@ public class HtmlParser implements Parse
 // to just inflate each byte to a 16-bit value by padding.
 // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
 // {U+0041, U+0082, U+00B7}.
-String str = "";
-try {
-  str = new String(content.array(), content.arrayOffset()
-  + content.position(), length, Charset.forName("ASCII").toString());
-} catch (UnsupportedEncodingException e) {
-  // code should never come here, but just in case...
-  return null;
-}
+String str = new String(content.array(), content.arrayOffset()
++ content.position(), length, StandardCharsets.US_ASCII);
 
 Matcher metaMatcher = metaPattern.matcher(str);
 String encoding = null;

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec  8 21:45:47 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
 * NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola 
Tonellotto, snagel)
 
 Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)

Modified: 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==
--- 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Tue Dec  8 21:45:47 2015
@@ -21,7 +21,7 @@ import java.util.ArrayList;
 import java.util.Map;
 import java.net.URL;
 import java.net.MalformedURLException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.io.*;
 import java.util.regex.*;
 
@@ -30,10 +30,8 @@ import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
@@ -48,7 +

svn commit: r1723851 - in /nutch/branches/2.x: CHANGES.txt src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

2016-01-09 Thread snagel
Author: snagel
Date: Sat Jan  9 13:01:31 2016
New Revision: 1723851

URL: http://svn.apache.org/viewvc?rev=1723851&view=rev
Log:
NUTCH-2168 Parse-tika fails to retrieve parser

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723851&r1=1723850&r2=1723851&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Jan  9 13:01:31 2016
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2168 Parse-tika fails to retrieve parser (snagel, Auro Miralles, 
lewismc)
+
 * NUTCH-2169 Integrate index-html into Nutch build (snagel)
 
 * NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, 
lewismc, snagel)

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1723851&r1=1723850&r2=1723851&view=diff
==
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 Sat Jan  9 13:01:31 2016
@@ -207,7 +207,7 @@ public class TikaParser implements org.a
 this.tikaConfig = null;
 
 try {
-  tikaConfig = TikaConfig.getDefaultConfig();
+  tikaConfig = new TikaConfig(this.getClass().getClassLoader());
 } catch (Exception e2) {
   String message = "Problem loading default Tika configuration";
   LOG.error(message, e2);




svn commit: r1723626 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/crawl/GeneratorJob.java

2016-01-07 Thread snagel
Author: snagel
Date: Thu Jan  7 20:57:13 2016
New Revision: 1723626

URL: http://svn.apache.org/viewvc?rev=1723626&view=rev
Log:
NUTCH-2143 GeneratorJob ignores batch id passed as argument

Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723626&r1=1723625&r2=1723626&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jan  7 20:57:13 2016
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, 
lewismc, snagel)
+
 * NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
 
 * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1723626&r1=1723625&r2=1723626&view=diff
==
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu 
Jan  7 20:57:13 2016
@@ -163,17 +163,20 @@ public class GeneratorJob extends NutchT
 return fields;
   }
 
+  /** Generate a random batch id */
+  public static String randomBatchId() {
+long curTime = System.currentTimeMillis();
+int randomSeed = Math.abs(new Random().nextInt());
+String batchId = (curTime / 1000) + "-" + randomSeed;
+return batchId;
+  }
+  
   public Map<String, Object> run(Map<String, Object> args) throws Exception {
 String batchId = (String) args.get(Nutch.ARG_BATCH);
-if (batchId != null) {
-  getConf().set(GeneratorJob.BATCH_ID, batchId);
-} else {
-  // generate batchId
-  long curTime = System.currentTimeMillis();
-  int randomSeed = Math.abs(new Random().nextInt());
-  batchId = (curTime / 1000) + "-" + randomSeed;
-  getConf().set(BATCH_ID, batchId);
+if (batchId == null) {
+  batchId = randomBatchId();
 }
+getConf().set(BATCH_ID, batchId);
 
 // map to inverted subset due for fetch, sort by score
 Long topN = null;
@@ -249,10 +252,15 @@ public class GeneratorJob extends NutchT
 if (topN != Long.MAX_VALUE) {
   LOG.info("GeneratorJob: topN: " + topN);
 }
+String batchId = getConf().get(BATCH_ID);
 Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
 Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
-Nutch.ARG_NORMALIZE, norm));
-String batchId = getConf().get(BATCH_ID);
+Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+if (batchId == null) {
+  // use generated random batch id
+  batchId = (String) results.get(BATCH_ID);
+}
+
 long finish = System.currentTimeMillis();
 long generateCount = (Long) results.get(GENERATE_COUNT);
 LOG.info("GeneratorJob: finished at " + sdf.format(finish)
@@ -290,11 +298,6 @@ public class GeneratorJob extends NutchT
 long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
 boolean filter = true, norm = true;
 
-// generate batchId
-int randomSeed = Math.abs(new Random().nextInt());
-String batchId = (curTime / 1000) + "-" + randomSeed;
-getConf().set(BATCH_ID, batchId);
-
 for (int i = 0; i < args.length; i++) {
   if ("-topN".equals(args[i])) {
 topN = Long.parseLong(args[++i]);
@@ -307,9 +310,9 @@ public class GeneratorJob extends NutchT
   } else if ("-adddays".equals(args[i])) {
 long numDays = Integer.parseInt(args[++i]);
 curTime += numDays * 1000L * 60 * 60 * 24;
-  } else if ("-batchId".equals(args[i]))
+  } else if ("-batchId".equals(args[i])) {
 getConf().set(BATCH_ID, args[++i]);
-  else {
+  } else {
 System.err.println("Unrecognized arg " + args[i]);
 return -1;
   }




svn commit: r1716177 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml

2015-11-24 Thread snagel
Author: snagel
Date: Tue Nov 24 15:37:32 2015
New Revision: 1716177

URL: http://svn.apache.org/viewvc?rev=1716177&view=rev
Log:
NUTCH-2175 Typos in property descriptions

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1716177&r1=1716176&r2=1716177&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 24 15:37:32 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2175 Typos in property descriptions in nutch-default.xml (Roannel 
Fernández Hernández via snagel)
+
 * NUTCH-2069 Ignore external links based on domain (jnioche)
 
 * NUTCH-2173 String.join in FileDumper breaks the build (joyce)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1716177&r1=1716176&r2=1716177&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Nov 24 15:37:32 2015
@@ -51,7 +51,7 @@
   true
   The crawler is not restricted to the directories that you 
specified in the
 Urls file but it is jumping into the parent directories as well. For your 
own crawlings you can
-change this bahavior (set to false) the way that only directories beneath 
the directories that you specify get
+change this behavior (set to false) the way that only directories beneath 
the directories that you specify get
 crawled.
 
 
@@ -209,7 +209,7 @@
   100
   The number of times a thread will delay when trying to
   fetch a page.  Each time it finds that a host is busy, it will wait
-  fetcher.server.delay.  After http.max.delays attepts, it will give
+  fetcher.server.delay.  After http.max.delays attempts, it will give
   up on the page for now.
 
 
@@ -752,7 +752,7 @@
   5.0
   The number of seconds the fetcher will delay between 
successive requests to the same server. Note that this might get
-   overriden by a Crawl-Delay from a robots.txt and is used ONLY if 
+   overridden by a Crawl-Delay from a robots.txt and is used ONLY if 
fetcher.threads.per.queue is set to 1.

 
@@ -1102,8 +1102,8 @@
   plugin.auto-activation
   true
   Defines if some plugins that are not activated regarding
-  the plugin.includes and plugin.excludes properties must be automaticaly
-  activated if they are needed by some actived plugins.
+  the plugin.includes and plugin.excludes properties must be automatically
+  activated if they are needed by some active plugins.
   
 
 
@@ -1218,14 +1218,13 @@
   parsefilter.naivebayes.trainfile
   naivebayes-train.txt
   Set the name of the file to be used for Naive Bayes training. 
The format will be: 
-Each line contains two tab seperted parts
+Each line contains two tab separated parts
 There are two columns/parts:
-1. "1" or "0", "1" for relevent and "0" for irrelevent document.
-3. Text (text that will be used for training)
+1. "1" or "0", "1" for relevant and "0" for irrelevant documents.
+2. Text (text that will be used for training)
 
 Each row will be considered a new "document" for the classifier.
 CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using 
this classifier.
-
   
 
 
@@ -1272,7 +1271,7 @@ CAUTION: Set the parser.timeout to -1 or
   tika.htmlmapper.classname
   org.apache.tika.parser.html.IdentityHtmlMapper
   Classname of Tika HTMLMapper to use. Influences the elements 
included in the DOM and hence
-  the behaviour of the HTMLParseFilters.
+  the behavior of the HTMLParseFilters.
   
 
 -->
@@ -1360,7 +1359,7 @@ CAUTION: Set the parser.timeout to -1 or
   scoring.depth.max
   1000
   Max depth value from seed allowed by default.
-  Can be overriden on a per-seed basis by specifying "_maxdepth_=VALUE"
+  Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE"
   as a seed metadata. This plugin adds a "_depth_" metadatum to the pages
   to track the distance from the seed it was found from. 
   The depth is used to prioritise URLs in the generation step so that
@@ -1373,7 +1372,7 @@ CAUTION: Set the parser.timeout to -1 or
 
   lang.analyze.max.length
   2048
-   The maximum bytes of data to uses to indentify
+   The maximum number of bytes used to identify
   the language (0 means full content analysis).
   The larger is this value, the better is the analysis, but the
   slowest it is.
@@ -1667,7 +1666,7 @@ CAUTION: Set the parser.timeout to -1 or
   solr.loadbalance.urls
   
   
-  A comma-seperated value representing the Solr servers to be used when
+  A comma-separated value representing the Solr servers to be used when
   initi

nutch git commit: NUTCH-2272 Index checker server to optionally keep client connection open - removed from change log for release 1.12 as it is not included

2016-06-23 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master af6d8763f -> d29be63bd


NUTCH-2272 Index checker server to optionally keep client connection open
- removed from change log for release 1.12 as it is not included


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d29be63b
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d29be63b
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d29be63b

Branch: refs/heads/master
Commit: d29be63bd44cfcaf7e0a1e340160df8a0ba2b600
Parents: af6d876
Author: Sebastian Nagel 
Authored: Thu Jun 23 17:09:19 2016 +0200
Committer: Sebastian Nagel 
Committed: Thu Jun 23 17:09:19 2016 +0200

--
 CHANGES.txt | 1 -
 1 file changed, 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/d29be63b/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index 877f23b..ffcf5ae 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -37,7 +37,6 @@ Bug
 
 Improvement
 
-[NUTCH-2272] - Index checker server to optionally keep client connection 
open
 [NUTCH-1233] - Rely on Tika for outlink extraction
 [NUTCH-1712] - Use MultipleInputs in Injector to make it a single 
mapreduce job
 [NUTCH-2172] - index-more: document format of contenttype-mapping.txt



[5/5] nutch git commit: fix unit test: CrawlDbFilter stil writes reduce output dirs as part-00000 (not part-r-00000)

2016-02-25 Thread snagel
fix unit test: CrawlDbFilter stil writes reduce output dirs as part-00000 (not 
part-r-00000)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5e430e5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5e430e5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5e430e5

Branch: refs/heads/master
Commit: f5e430e557cc3768261ab86617b1b1589e120d92
Parents: 756f2a1
Author: Sebastian Nagel 
Authored: Thu Feb 25 22:37:47 2016 +0100
Committer: Sebastian Nagel 
Committed: Thu Feb 25 22:37:47 2016 +0100

--
 src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/f5e430e5/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
--
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java 
b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 5c38037..38c38ed 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -106,7 +106,7 @@ public class TestCrawlDbFilter {
 job.setOutputValueClass(CrawlDatum.class);
 JobClient.runJob(job);
 
-Path fetchlist = new Path(new Path(newCrawlDb, "part-r-00000"), "data");
+Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
 
 ArrayList l = readContents(fetchlist);
 



[1/5] nutch git commit: update tests to reflect change of reduce outputs by new API (part-nnnnn -> part-r-nnnnn): all unit tests pass now

2016-02-25 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master 25e879afc -> f5e430e55


update tests to reflect change of reduce outputs by new API (part-nnnnn -> 
part-r-nnnnn): all unit tests pass now


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0baca7a9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0baca7a9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0baca7a9

Branch: refs/heads/master
Commit: 0baca7a966dd1031c80caa5e8e4a3e855c1f358e
Parents: 288dcee
Author: Sebastian Nagel 
Authored: Sun Jan 17 22:20:32 2016 +0100
Committer: Sebastian Nagel 
Committed: Thu Feb 25 21:26:30 2016 +0100

--
 src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java   | 2 +-
 src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java | 2 +-
 src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java | 2 +-
 src/test/org/apache/nutch/crawl/TestInjector.java  | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
--
diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java 
b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
index 86ba76c..56905e4 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -60,7 +60,7 @@ public class CrawlDBTestUtil {
 Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
 org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = 
SequenceFile.Writer.valueClass(CrawlDatum.class);
 MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
-"part-00000"), wKeyOpt, wValueOpt);
+"part-r-00000"), wKeyOpt, wValueOpt);
 Iterator it = init.iterator();
 while (it.hasNext()) {
   URLCrawlDatum row = it.next();

http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
--
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java 
b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 38c38ed..5c38037 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -106,7 +106,7 @@ public class TestCrawlDbFilter {
 job.setOutputValueClass(CrawlDatum.class);
 JobClient.runJob(job);
 
-Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
+Path fetchlist = new Path(new Path(newCrawlDb, "part-r-00000"), "data");
 
 ArrayList l = readContents(fetchlist);
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
--
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java 
b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
index c800610..b670551 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -149,7 +149,7 @@ public class TestCrawlDbMerger {
 org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = 
SequenceFile.Writer.valueClass(CrawlDatum.class);
 
 MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
-"part-00000"), wKeyOpt, wValueOpt);
+"part-r-00000"), wKeyOpt, wValueOpt);
 Iterator it = init.iterator();
 while (it.hasNext()) {
   String key = it.next();

http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestInjector.java
--
diff --git a/src/test/org/apache/nutch/crawl/TestInjector.java 
b/src/test/org/apache/nutch/crawl/TestInjector.java
index 135f392..7293cbb 100644
--- a/src/test/org/apache/nutch/crawl/TestInjector.java
+++ b/src/test/org/apache/nutch/crawl/TestInjector.java
@@ -141,7 +141,7 @@ public class TestInjector {
 
   private List readCrawldb() throws IOException {
 Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
-+ "/part-00000/data");
++ "/part-r-00000/data");
 System.out.println("reading:" + dbfile);
 Option rFile = SequenceFile.Reader.file(dbfile);
 @SuppressWarnings("resource")
@@ -161,7 +161,7 @@ public class TestInjector {
 
   private HashMap readCrawldbRecords() throws IOException {
 Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
-+ "/part-00000/data");
++ "/part-r-00000/data");
 System.out.println("reading:" + dbfile);
 Option rFile = SequenceFile.Reader.file(dbfile);
 

[3/5] nutch git commit: NUTCH-1712 applied to current trunk; run first simple tests (inject + merge)

2016-02-25 Thread snagel
NUTCH-1712 applied to current trunk; run first simple tests (inject + merge)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3c691eb2
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3c691eb2
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3c691eb2

Branch: refs/heads/master
Commit: 3c691eb2823cb85c9ffe95e9212ce7ac0e564709
Parents: 25e879a
Author: Sebastian Nagel 
Authored: Mon Oct 19 21:48:05 2015 +0200
Committer: Sebastian Nagel 
Committed: Thu Feb 25 21:26:30 2016 +0100

--
 src/java/org/apache/nutch/crawl/CrawlDb.java  |  19 +
 src/java/org/apache/nutch/crawl/Injector.java | 599 -
 2 files changed, 360 insertions(+), 258 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/3c691eb2/src/java/org/apache/nutch/crawl/CrawlDb.java
--
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java 
b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 053e8fb..1537cdc 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -28,8 +28,10 @@ import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.util.*;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.FSUtils;
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
@@ -173,6 +175,23 @@ public class CrawlDb extends NutchTool implements Tool {
 LockUtil.removeLockFile(fs, lock);
   }
 
+  public static void install(Job job, Path crawlDb) throws IOException {
+Configuration conf = job.getConfiguration();
+boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
+FileSystem fs = FileSystem.get(conf);
+Path old = new Path(crawlDb, "old");
+Path current = new Path(crawlDb, CURRENT_NAME);
+Path tempCrawlDb = org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+.getOutputPath(job);
+FSUtils.replace(fs, old, current, true);
+FSUtils.replace(fs, current, tempCrawlDb, true);
+Path lock = new Path(crawlDb, LOCK_NAME);
+LockUtil.removeLockFile(fs, lock);
+if (!preserveBackup && fs.exists(old)) {
+  fs.delete(old, true);
+}
+  }
+
   public static void main(String[] args) throws Exception {
 int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
 System.exit(res);

http://git-wip-us.apache.org/repos/asf/nutch/blob/3c691eb2/src/java/org/apache/nutch/crawl/Injector.java
--
diff --git a/src/java/org/apache/nutch/crawl/Injector.java 
b/src/java/org/apache/nutch/crawl/Injector.java
index dc1f1cf..0d01dc8 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -17,211 +17,267 @@
 
 package org.apache.nutch.crawl;
 
-import java.io.*;
-import java.text.SimpleDateFormat;
-import java.util.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.*;
-import org.apache.nutch.net.*;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import 

[4/5] nutch git commit: NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job, this closes #86

2016-02-25 Thread snagel
NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job, 
this closes #86


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/756f2a1c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/756f2a1c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/756f2a1c

Branch: refs/heads/master
Commit: 756f2a1c88d638f21515ec472088d8f504d12d44
Parents: 0baca7a
Author: Sebastian Nagel <sna...@apache.org>
Authored: Thu Feb 25 22:04:14 2016 +0100
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Thu Feb 25 22:24:45 2016 +0100

--
 CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/756f2a1c/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index 71647ee..9b3895c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this 
CHANGES.txt for the Nutch
 
 Nutch Change Log
 
+* NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job 
(tejasp, snagel)
+
 * NUTCH-2231 Jexl support in generator job (markus)
 
 * NUTCH-2232 DeduplicationJob should decode URL's before length is compared 
(Ron van der Vegt via markus)



[2/5] nutch git commit: add unit tests based on MRUnit

2016-02-25 Thread snagel
add unit tests based on MRUnit


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/288dceed
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/288dceed
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/288dceed

Branch: refs/heads/master
Commit: 288dceedb7de28457878eecb03a571d082a48cc2
Parents: 3c691eb
Author: Sebastian Nagel 
Authored: Sun Jan 17 21:32:31 2016 +0100
Committer: Sebastian Nagel 
Committed: Thu Feb 25 21:26:30 2016 +0100

--
 ivy/ivy.xml |  10 +-
 ivy/ivysettings.xml |   2 +-
 src/java/org/apache/nutch/crawl/Injector.java   |   7 +-
 .../nutch/crawl/CrawlDbUpdateTestDriver.java| 138 +++
 .../apache/nutch/crawl/TestCrawlDbStates.java   |   7 +-
 .../org/apache/nutch/crawl/TestInjector.java|   3 +-
 .../org/apache/nutch/fetcher/TestFetcher.java   |   2 +-
 7 files changed, 156 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivy.xml
--
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 206cce7..bc8d293 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -11,7 +11,7 @@
OF ANY KIND, either express or implied. See the License for the 
specific 
language governing permissions and limitations under the License. -->
 
-
+http://ant.apache.org/ivy/maven;>

http://www.apache.org/licenses/LICENSE-2.0.txt/; />
@@ -98,6 +98,10 @@
 


+   
+   
+   
+   



@@ -125,9 +129,7 @@



-
-
-
+   
 

 

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivysettings.xml
--
diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index 0319333..d9b5044 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -35,7 +35,7 @@
 value="https://repository.apache.org/content/repositories/snapshots/;
 override="false"/>
   
+
value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
   
   

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/java/org/apache/nutch/crawl/Injector.java
--
diff --git a/src/java/org/apache/nutch/crawl/Injector.java 
b/src/java/org/apache/nutch/crawl/Injector.java
index 0d01dc8..383aaf1 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -319,12 +319,13 @@ public class Injector extends NutchTool implements Tool {
 setConf(conf);
   }
 
-  public void inject(Path crawlDb, Path urlDir) throws Exception {
+  public void inject(Path crawlDb, Path urlDir)
+  throws IOException, ClassNotFoundException, InterruptedException {
 inject(crawlDb, urlDir, false, false);
   }
 
   public void inject(Path crawlDb, Path urlDir, boolean overwrite,
-  boolean update) throws Exception {
+  boolean update) throws IOException, ClassNotFoundException, 
InterruptedException {
 SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
 long start = System.currentTimeMillis();
 
@@ -397,7 +398,7 @@ public class Injector extends NutchTool implements Tool {
 LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
 + TimingUtil.elapsedTime(start, end));
   }
-} catch (Exception e) {
+} catch (IOException e) {
   if (fs.exists(tempCrawlDb)) {
 fs.delete(tempCrawlDb, true);
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
--
diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java 
b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
new file mode 100644
index 0000000..7238f88
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or 

svn commit: r1726314 - in /nutch/trunk: CHANGES.txt conf/regex-normalize.xml.template ivy/ivy.xml

2016-01-22 Thread snagel
Author: snagel
Date: Fri Jan 22 21:26:12 2016
New Revision: 1726314

URL: http://svn.apache.org/viewvc?rev=1726314&view=rev
Log:
NUTCH-2204 Remove junit lib from runtime

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/regex-normalize.xml.template
nutch/trunk/ivy/ivy.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1726314&r1=1726313&r2=1726314&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 22 21:26:12 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2204 Remove junit lib from runtime (snagel)
+
 * NUTCH-2201 Remove loops program from webgraph package (markus)
 
 * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp)

Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1726314&r1=1726313&r2=1726314&view=diff
==
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Fri Jan 22 21:26:12 2016
@@ -39,11 +39,12 @@
   /$3
  -->
 
-
+
 
 
 

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1726314&r1=1726313&r2=1726314&view=diff
==
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Fri Jan 22 21:26:12 2016
@@ -92,6 +92,7 @@



+   

 





nutch git commit: Inconsistent log level

2016-04-29 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master 6d2bfa986 -> 0e03daf11


Inconsistent log level


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0e03daf1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0e03daf1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0e03daf1

Branch: refs/heads/master
Commit: 0e03daf1139a1a8465d6e2a6b54490e2dfc2a9ef
Parents: 6d2bfa9
Author: Sebastian Nagel <sna...@apache.org>
Authored: Fri Apr 29 18:33:04 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Apr 29 18:33:04 2016 +0200

--
 CHANGES.txt  | 2 ++
 src/java/org/apache/nutch/fetcher/FetcherThread.java | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/0e03daf1/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index 6173134..436db07 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this 
CHANGES.txt for the Nutch
 
 Nutch Change Log
 
+* NUTCH-2256 Inconsistent log level (songwanging via snagel)
+
 * NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 
(Federico Bonelli, snagel)
 
 * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme 
Gowda N.,lewismc via mattmann)

http://git-wip-us.apache.org/repos/asf/nutch/blob/0e03daf1/src/java/org/apache/nutch/fetcher/FetcherThread.java
--
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java 
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 09315a7..e57e735 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -286,7 +286,7 @@ public class FetcherThread extends Thread {
 .getFetchItemQueue(fit.queueID);
 fiq.crawlDelay = rules.getCrawlDelay();
 if (LOG.isDebugEnabled()) {
-  LOG.info("Crawl delay for queue: " + fit.queueID
+  LOG.debug("Crawl delay for queue: " + fit.queueID
   + " is set to " + fiq.crawlDelay
   + " as per robots.txt. url: " + fit.url);
 }



nutch git commit: NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 - generate base64 encoded string directly from content bytes (patch provided by Federico Bonelli) - add JUn

2016-04-27 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master 8572fd955 -> 6d2bfa986


NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64
 - generate base64 encoded string directly from content bytes
   (patch provided by Federico Bonelli)
 - add JUnit test to test indexing base64 encoded binary content
   with UTF-8, ISO-8859-1 and ISO-8859-2 character sets


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6d2bfa98
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6d2bfa98
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6d2bfa98

Branch: refs/heads/master
Commit: 6d2bfa98635d8055d56dbe2597efc953f420ed5a
Parents: 8572fd9
Author: Sebastian Nagel <sna...@apache.org>
Authored: Mon Apr 25 14:40:44 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Wed Apr 27 22:49:47 2016 +0200

--
 CHANGES.txt |   2 +
 .../apache/nutch/indexer/IndexerMapReduce.java  |  12 +-
 .../apache/nutch/indexer/NutchIndexAction.java  |   3 +
 .../nutch/indexer/TestIndexerMapReduce.java | 187 +++
 4 files changed, 198 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index e14d7c5..6173134 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this 
CHANGES.txt for the Nutch
 
 Nutch Change Log
 
+* NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 
(Federico Bonelli, snagel)
+
 * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme 
Gowda N.,lewismc via mattmann)
 
 * NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine 
Similarity Model (bhavyasanghavi via sujen)

http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
--
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java 
b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 1d5f66f..5025525 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -350,14 +350,14 @@ public class IndexerMapReduce extends Configured 
implements
 }
 
 if (content != null) {
-  // Get the original unencoded content
-  String binary = new String(content.getContent());
-
-  // optionally encode as base64
+  // Add the original binary content
+  String binary;
   if (base64) {
-binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary));
+// optionally encode as base64
+binary = Base64.encodeBase64String(content.getContent());
+  } else {
+binary = new String(content.getContent());
   }
-
   doc.add("binaryContent", binary);
 }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/NutchIndexAction.java
--
diff --git a/src/java/org/apache/nutch/indexer/NutchIndexAction.java 
b/src/java/org/apache/nutch/indexer/NutchIndexAction.java
index 679d784..b2517c3 100644
--- a/src/java/org/apache/nutch/indexer/NutchIndexAction.java
+++ b/src/java/org/apache/nutch/indexer/NutchIndexAction.java
@@ -37,6 +37,9 @@ public class NutchIndexAction implements Writable {
   public NutchDocument doc = null;
   public byte action = ADD;
 
+  protected NutchIndexAction() {
+  }
+
   public NutchIndexAction(NutchDocument doc, byte action) {
 this.doc = doc;
 this.action = action;

http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
--
diff --git a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java 
b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
new file mode 100644
index 0000000..d581a0f
--- /dev/null
+++ b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writ

nutch git commit: Inconsistent log level

2016-04-29 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/2.x 9e7c0e6fa -> 1fc254e5e


Inconsistent log level


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/1fc254e5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/1fc254e5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/1fc254e5

Branch: refs/heads/2.x
Commit: 1fc254e5eb68f40f66911ba9854d20c0fea88fc9
Parents: 9e7c0e6
Author: Sebastian Nagel <sna...@apache.org>
Authored: Fri Apr 29 18:46:04 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Apr 29 18:46:04 2016 +0200

--
 CHANGES.txt   | 2 ++
 src/java/org/apache/nutch/fetcher/FetcherReducer.java | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/1fc254e5/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index 0a20a98..b7f1345 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch 2.4 Development
 
+ * NUTCH-2256 Inconsistent log level (songwanging via snagel)
+
  * NUTCH-961 GitHub-92 Add the boilerpipe parsing adapted from NUTCH-961 
(Jeremie Bourseaux <jeremie.bours...@xilopix.com> via mattmann)
 
  * GitHub-94 Fix the issue of the bad timestamp. (Jeremie Bourseaux 
<jeremie.bours...@xilopix.com> via mattmann)

http://git-wip-us.apache.org/repos/asf/nutch/blob/1fc254e5/src/java/org/apache/nutch/fetcher/FetcherReducer.java
--
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java 
b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 00860b6..8ee7477 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -522,7 +522,7 @@ public class FetcherReducer extends
 .getFetchItemQueue(fit.queueID);
 fiq.crawlDelay = rules.getCrawlDelay();
 if (LOG.isDebugEnabled()) {
-  LOG.info("Crawl delay for queue: " + fit.queueID
+  LOG.debug("Crawl delay for queue: " + fit.queueID
   + " is set to " + fiq.crawlDelay
   + " as per robots.txt. url: " + fit.url);
 }



nutch git commit: fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets

2016-04-18 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master 044e8e77e -> 8572fd955


fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/8572fd95
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/8572fd95
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/8572fd95

Branch: refs/heads/master
Commit: 8572fd9551b430f31a4fdace14738f2d9959b370
Parents: 044e8e7
Author: Karanjeet Singh 
Authored: Mon Apr 18 00:45:37 2016 -0700
Committer: Karanjeet Singh 
Committed: Mon Apr 18 00:45:37 2016 -0700

--
 src/plugin/protocol-htmlunit/build.xml  |   9 -
 .../nutch/protocol/htmlunit/HttpResponse.java   | 408 ++-
 2 files changed, 317 insertions(+), 100 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/build.xml
--
diff --git a/src/plugin/protocol-htmlunit/build.xml 
b/src/plugin/protocol-htmlunit/build.xml
index bf695fe..899214c 100644
--- a/src/plugin/protocol-htmlunit/build.xml
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -34,13 +34,4 @@
 
   
 
-  
-  
-
-
-
-  
-
-  
-
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
--
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 7242f40..8b1a031 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -20,11 +20,18 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.EOFException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.PushbackInputStream;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -35,46 +42,78 @@ import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.http.api.HttpException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
+/**
+ * An HTTP response.
+ */
 public class HttpResponse implements Response {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(HttpResponse.class);
-
-  private Http http;
+  private Configuration conf;
+  private HttpBase http;
   private URL url;
+  private String orig;
+  private String base;
   private byte[] content;
   private int code;
   private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
 
-  /** The nutch configuration */
-  private Configuration conf = null;
+  protected enum Scheme {
+HTTP, HTTPS,
+  }
 
-  public HttpResponse(Http http, URL url, CrawlDatum datum) throws 
ProtocolException, IOException {
+  /**
+   * Default public constructor.
+   *
+   * @param http
+   * @param url
+   * @param datum
+   * @throws ProtocolException
+   * @throws IOException
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+  throws ProtocolException, IOException {
 
-this.conf = http.getConf();
 this.http = http;
 this.url = url;
+this.orig = url.toString();
+this.base = url.toString();
+
+Scheme scheme = null;
+
+if ("http".equals(url.getProtocol())) {
+  scheme = Scheme.HTTP;
+} else if ("https".equals(url.getProtocol())) {
+  scheme = Scheme.HTTPS;
+} else {
+  throw new HttpException("Unknown scheme (not http/https) for url:" + 
url);
+}
+
+if (Http.LOG.isTraceEnabled()) {
+  Http.LOG.trace("fetching " + url);
+}
 
-LOG.info("fetching {}", url);
-
 String path = "".equals(url.getFile()) ? "/" : url.getFile();
 
 // some servers will redirect a request with a host line like
 // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
 // don't want the :80...
+
 String host = url.getHost();
 int port;
 String portString;
 if (url.getPort() == -1) {
-  port = 80;
+  if (scheme == Scheme.HTTP) {
+port = 80;
+  } else {
+

[2/2] nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html - fix broken unit test (fix HTML markup, make test for meta data extraction obligatory) -

2016-07-01 Thread snagel
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using 
parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction 
obligatory)
- add all values of general metadata to parse metadata


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada

Branch: refs/heads/master
Commit: 34050adae0896a6d7ddb254a1622a03af6e07175
Parents: c18e19b
Author: Sebastian Nagel 
Authored: Fri Jul 1 15:07:52 2016 +0200
Committer: Sebastian Nagel 
Committed: Fri Jul 1 15:10:49 2016 +0200

--
 .../org/apache/nutch/metadata/Metadata.java | 25 
 .../org/apache/nutch/parse/html/HtmlParser.java |  4 +---
 .../apache/nutch/parse/html/TestHtmlParser.java | 11 -
 3 files changed, 31 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java
--
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java 
b/src/java/org/apache/nutch/metadata/Metadata.java
index f0bfcd3..8a57ee3 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -123,6 +123,31 @@ public class Metadata implements Writable, 
CreativeCommons, DublinCore,
   }
 
   /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing 
values.
+   *
+   * @param metadata
+   *  other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+for (String name : metadata.names()) {
+  String[] addValues = metadata.getValues(name);
+  if (addValues == null)
+continue;
+  String[] oldValues = this.metadata.get(name);
+  if (oldValues == null) {
+this.metadata.put(name, addValues);
+  } else {
+String[] newValues = new String[oldValues.length + addValues.length];
+System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+System.arraycopy(addValues, 0, newValues, oldValues.length,
+addValues.length);
+this.metadata.put(name, newValues);
+  }
+}
+  }
+
+  /**
* Copy All key-value pairs from properties.
* 
* @param properties

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
--
diff --git 
a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java 
b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index baa..4d043ba 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -183,9 +183,7 @@ public class HtmlParser implements Parser {
 HTMLMetaProcessor.getMetaTags(metaTags, root, base);
 
 // populate Nutch metadata with HTML meta directives
-for (String name : metaTags.getGeneralTags().names()) {
-  metadata.add(name, metaTags.getGeneralTags().get(name));
-}
+metadata.addAll(metaTags.getGeneralTags());
 
 if (LOG.isTraceEnabled()) {
   LOG.trace("Meta tags for " + base + ": " + metaTags.toString());

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
--
diff --git 
a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index bcfe9e4..7099f50 100644
--- 
a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ 
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -40,8 +40,8 @@ public class TestHtmlParser {
   private static final String encodingTestBody = "\n  français\n  
español\n  русский язык\n  čeština\n  
ελληνικά\n";
   private static final String encodingTestContent = ""
   + encodingTestKeywords + "\n"
-  + "\n" + "\n" + encodingTestBody + 
"\n";
+  + "\n"
+  + "\n" + encodingTestBody + "\n";
 
   private static String[][] encodingTestPages = {
   {
@@ -113,10 +113,9 @@ public class TestHtmlParser {
 Assert.assertTrue(keyword + " not found in text (" + name + ")",
 text.contains(keyword));
   }
-  if (keywords != null) {
-Assert.assertEquals("Keywords not 

[1/2] nutch git commit: NUTCH-2291 - Fix mrunit dependencies - remove classifier from dependency because pom file name on Maven repository does not contain a classifier

2016-07-01 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master cb6fbae51 -> 34050adae


NUTCH-2291 - Fix mrunit dependencies
- remove classifier from dependency because pom file name on Maven repository 
does not contain a classifier


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/c18e19bf
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/c18e19bf
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/c18e19bf

Branch: refs/heads/master
Commit: c18e19bfe63c3ac5221d1a0f454b9e1a037a4386
Parents: cb6fbae
Author: Sebastian Nagel 
Authored: Fri Jul 1 14:45:41 2016 +0200
Committer: Sebastian Nagel 
Committed: Fri Jul 1 14:45:41 2016 +0200

--
 ivy/ivy.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/c18e19bf/ivy/ivy.xml
--
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index a4e9481..a9a83ae 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -97,7 +97,7 @@
 


-   
+   






[3/4] nutch git commit: CrawlDb statistics: add fetch time (earliest, latest, average)

2016-07-02 Thread snagel
CrawlDb statistics: add fetch time (earliest, latest, average)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ea2843b9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ea2843b9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ea2843b9

Branch: refs/heads/master
Commit: ea2843b9be6569e17963031d7370f5db42261809
Parents: 6b141fb
Author: Sebastian Nagel 
Authored: Mon Jun 20 14:42:04 2016 +0200
Committer: Sebastian Nagel 
Committed: Sat Jul 2 12:06:04 2016 +0200

--
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 76 
 1 file changed, 46 insertions(+), 30 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/ea2843b9/src/java/org/apache/nutch/crawl/CrawlDbReader.java
--
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 8f42ac4..381cec5 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -197,6 +197,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
   .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
   output.collect(new Text("s"), new LongWritable(
   (long) (value.getScore() * 1000.0)));
+  output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
   if (sort) {
 URL u = new URL(key.toString());
 String host = u.getHost();
@@ -219,32 +220,40 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 public void close() {
 }
 
+private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> 
values,
+OutputCollector<Text, LongWritable> output, Reporter reporter)
+throws IOException {
+  long total = 0;
+  long min = Long.MAX_VALUE;
+  long max = Long.MIN_VALUE;
+  while (values.hasNext()) {
+LongWritable cnt = values.next();
+if (cnt.get() < min)
+  min = cnt.get();
+if (cnt.get() > max)
+  max = cnt.get();
+total += cnt.get();
+  }
+  output.collect(new Text(keyPrefix+"n"), new LongWritable(min));
+  output.collect(new Text(keyPrefix+"x"), new LongWritable(max));
+  output.collect(new Text(keyPrefix+"t"), new LongWritable(total));
+}
+
 public void reduce(Text key, Iterator<LongWritable> values,
 OutputCollector<Text, LongWritable> output, Reporter reporter)
 throws IOException {
   val.set(0L);
   String k = key.toString();
-  if (!k.equals("s")) {
+  if (k.equals("s")) {
+reduceMinMaxTotal("sc", values, output, reporter);
+  } else if (k.equals("f")) {
+reduceMinMaxTotal("ft", values, output, reporter);
+  } else {
 while (values.hasNext()) {
   LongWritable cnt = values.next();
   val.set(val.get() + cnt.get());
 }
 output.collect(key, val);
-  } else {
-long total = 0;
-long min = Long.MAX_VALUE;
-long max = Long.MIN_VALUE;
-while (values.hasNext()) {
-  LongWritable cnt = values.next();
-  if (cnt.get() < min)
-min = cnt.get();
-  if (cnt.get() > max)
-max = cnt.get();
-  total += cnt.get();
-}
-output.collect(new Text("scn"), new LongWritable(min));
-output.collect(new Text("scx"), new LongWritable(max));
-output.collect(new Text("sct"), new LongWritable(total));
   }
 }
   }
@@ -277,7 +286,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
   cnt.set(cnt.get() + val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("scx")) {
+  } else if (k.equals("scx") || k.equals("ftx")) {
 LongWritable cnt = new LongWritable(Long.MIN_VALUE);
 while (values.hasNext()) {
   LongWritable val = values.next();
@@ -285,7 +294,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 cnt.set(val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("scn")) {
+  } else if (k.equals("scn") || k.equals("ftn")) {
 LongWritable cnt = new LongWritable(Long.MAX_VALUE);
 while (values.hasNext()) {
   LongWritable val = values.next();
@@ -293,7 +302,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 cnt.set(val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("sct")) {
+  } else if (k.equals("sct") || k.equals("ftt")) {
 LongWritable cnt = new LongWritable();
 while (values.hasNext()) {
   

[2/4] nutch git commit: CrawlDb statistics: add fetch interval (shortest, longest, average)

2016-07-02 Thread snagel
CrawlDb statistics: add fetch interval (shortest, longest, average)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/39f6c713
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/39f6c713
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/39f6c713

Branch: refs/heads/master
Commit: 39f6c713974240d19d54a515cd04372878739456
Parents: ea2843b
Author: Sebastian Nagel 
Authored: Wed Jun 22 16:22:33 2016 +0200
Committer: Sebastian Nagel 
Committed: Sat Jul 2 12:06:04 2016 +0200

--
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 35 -
 src/java/org/apache/nutch/util/TimingUtil.java  | 53 
 2 files changed, 55 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/crawl/CrawlDbReader.java
--
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 381cec5..3cf6ff3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -69,6 +69,7 @@ import org.apache.nutch.util.JexlUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlEngine;
 import org.apache.commons.lang.time.DateUtils;
@@ -195,9 +196,10 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
   output.collect(new Text("status " + value.getStatus()), COUNT_1);
   output
   .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
-  output.collect(new Text("s"), new LongWritable(
+  output.collect(new Text("sc"), new LongWritable(
   (long) (value.getScore() * 1000.0)));
-  output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
+  output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
+  output.collect(new Text("fi"), new 
LongWritable(value.getFetchInterval()));
   if (sort) {
 URL u = new URL(key.toString());
 String host = u.getHost();
@@ -244,10 +246,8 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 throws IOException {
   val.set(0L);
   String k = key.toString();
-  if (k.equals("s")) {
-reduceMinMaxTotal("sc", values, output, reporter);
-  } else if (k.equals("f")) {
-reduceMinMaxTotal("ft", values, output, reporter);
+  if (k.equals("sc") || k.equals("ft") || k.equals("fi")) {
+reduceMinMaxTotal(k, values, output, reporter);
   } else {
 while (values.hasNext()) {
   LongWritable cnt = values.next();
@@ -286,7 +286,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
   cnt.set(cnt.get() + val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("scx") || k.equals("ftx")) {
+  } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
 LongWritable cnt = new LongWritable(Long.MIN_VALUE);
 while (values.hasNext()) {
   LongWritable val = values.next();
@@ -294,7 +294,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 cnt.set(val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("scn") || k.equals("ftn")) {
+  } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
 LongWritable cnt = new LongWritable(Long.MAX_VALUE);
 while (values.hasNext()) {
   LongWritable val = values.next();
@@ -302,7 +302,7 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
 cnt.set(val.get());
 }
 output.collect(key, cnt);
-  } else if (k.equals("sct") || k.equals("ftt")) {
+  } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) {
 LongWritable cnt = new LongWritable();
 while (values.hasNext()) {
   LongWritable val = values.next();
@@ -402,16 +402,16 @@ public class CrawlDbReader extends Configured implements 
Closeable, Tool {
  LongWritable val = stats.get(k);
  if (val == null) {
  val = new LongWritable();
- if (k.equals("scx") || k.equals("ftx"))
+ if (k.equals("scx") || k.equals("ftx") || 
k.equals("fix"))
  val.set(Long.MIN_VALUE);
- if (k.equals("scn") || 

[1/2] nutch git commit: Remove obsolete properties protocol.plugin.check.blocking and protocol.plugin.check.robots

2016-08-16 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/master d27c351f4 -> d37b7ce13


Remove obsolete properties protocol.plugin.check.blocking and 
protocol.plugin.check.robots


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/070a637b
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/070a637b
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/070a637b

Branch: refs/heads/master
Commit: 070a637babedc324948c0c58b333668bab6b813d
Parents: d27c351
Author: Sebastian Nagel 
Authored: Mon Aug 15 11:19:46 2016 +0200
Committer: Sebastian Nagel 
Committed: Mon Aug 15 11:19:46 2016 +0200

--
 src/java/org/apache/nutch/fetcher/Fetcher.java   |  4 
 src/java/org/apache/nutch/protocol/Protocol.java | 18 --
 2 files changed, 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/070a637b/src/java/org/apache/nutch/fetcher/Fetcher.java
--
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java 
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index aad9ee9..e60b10f 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -209,10 +209,6 @@ MapRunnable {
   feeder.setTimeLimit(timelimit);
 feeder.start();
 
-// set non-blocking & no-robots mode for HTTP protocol plugins.
-getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
-getConf().setBoolean(Protocol.CHECK_ROBOTS, false);
-
 for (int i = 0; i < threadCount; i++) { // spawn threads
   FetcherThread t = new FetcherThread(getConf(), getActiveThreads(), 
fetchQueues, 
   feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName,

http://git-wip-us.apache.org/repos/asf/nutch/blob/070a637b/src/java/org/apache/nutch/protocol/Protocol.java
--
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java 
b/src/java/org/apache/nutch/protocol/Protocol.java
index 0aa5d29..efd0100 100755
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -33,24 +33,6 @@ public interface Protocol extends Pluggable, Configurable {
   public final static String X_POINT_ID = Protocol.class.getName();
 
   /**
-   * Property name. If in the current configuration this property is set to
-   * true, protocol implementations should handle "politeness" limits
-   * internally. If this is set to false, it is assumed that these limits are
-   * enforced elsewhere, and protocol implementations should not enforce them
-   * internally.
-   */
-  public final static String CHECK_BLOCKING = "protocol.plugin.check.blocking";
-
-  /**
-   * Property name. If in the current configuration this property is set to
-   * true, protocol implementations should handle robot exclusion rules
-   * internally. If this is set to false, it is assumed that these limits are
-   * enforced elsewhere, and protocol implementations should not enforce them
-   * internally.
-   */
-  public final static String CHECK_ROBOTS = "protocol.plugin.check.robots";
-
-  /**
* Returns the {@link Content} for a fetchlist entry.
*/
   ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);



[2/2] nutch git commit: Merge branch 'NUTCH-2299' of https://github.com/sebastian-nagel/nutch this closes #140 - Remove obsolete properties protocol.plugin.check.*

2016-08-16 Thread snagel
Merge branch 'NUTCH-2299' of https://github.com/sebastian-nagel/nutch this 
closes #140
- Remove obsolete properties protocol.plugin.check.*


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d37b7ce1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d37b7ce1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d37b7ce1

Branch: refs/heads/master
Commit: d37b7ce13ee82f0a7d1388f87c2be5d636e425aa
Parents: d27c351 070a637
Author: Sebastian Nagel 
Authored: Tue Aug 16 20:43:01 2016 +0200
Committer: Sebastian Nagel 
Committed: Tue Aug 16 20:43:01 2016 +0200

--
 src/java/org/apache/nutch/fetcher/Fetcher.java   |  4 
 src/java/org/apache/nutch/protocol/Protocol.java | 18 --
 2 files changed, 22 deletions(-)
--




nutch git commit: NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority - check whether URL.getAuthority() returns null - recompose URLs without authority with empty authority/host

2017-02-01 Thread snagel
Repository: nutch
Updated Branches:
  refs/heads/2.x 022ed5c03 -> 700857d16


NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs without authority with empty authority/host


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/700857d1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/700857d1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/700857d1

Branch: refs/heads/2.x
Commit: 700857d16c9e1517ddb9868ed41171d91e5c9116
Parents: 022ed5c
Author: Sebastian Nagel 
Authored: Wed Feb 1 11:51:04 2017 +0100
Committer: Sebastian Nagel 
Committed: Wed Feb 1 11:51:04 2017 +0100

--
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java  | 5 -
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java  | 6 ++
 2 files changed, 10 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
--
diff --git 
a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 
b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index e17b19a..15a1de0 100644
--- 
a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -79,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements 
URLNormalizer {
 if ("http".equals(protocol) || "https".equals(protocol)
 || "ftp".equals(protocol)) {
 
-  if (host != null) {
+  if (host != null && url.getAuthority() != null) {
 String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
 if (!host.equals(newHost)) {
   host = newHost;
@@ -89,6 +89,9 @@ public class BasicURLNormalizer extends Configured implements 
URLNormalizer {
   // etc.) which will likely cause a change if left away
   changed = true;
 }
+  } else {
+// no host or authority: recompose the URL from components
+changed = true;
   }
 
   if (port == url.getDefaultPort()) { // uses default port

http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
--
diff --git 
a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
 
b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 006c1a3..1d5d99e 100644
--- 
a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -100,6 +100,12 @@ public class TestBasicURLNormalizer {
 "http://foo.com/aa/bb/foo.html");
 normalizeTest("http://foo.com/aa?referer=http://bar.com",
 "http://foo.com/aa?referer=http://bar.com");
+// check for NPEs when normalizing URLs without host (authority)
+normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+normalizeTest("ftp:/", "ftp:/");
+normalizeTest("http:", "http:/");
+normalizeTest("http:////", "http:/");
+normalizeTest("http:///////", "http:/");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {



  1   2   3   4   5   6   7   8   9   >