svn commit: r1387357 - in /nutch/trunk: CHANGES.txt build.xml
Author: snagel Date: Tue Sep 18 20:54:05 2012 New Revision: 1387357 URL: http://svn.apache.org/viewvc?rev=1387357view=rev Log: NUTCH-1415 release packages to contain top level folder apache-nutch-x.x Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1387357r1=1387356r2=1387357view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Sep 18 20:54:05 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1415 release packages to contain top level folder apache-nutch-x.x (snagel) + * NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc) * NUTCH-1470 Ensure test files are included for runtime testing (lewismc) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1387357r1=1387356r2=1387357view=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Tue Sep 18 20:54:05 2012 @@ -698,14 +698,13 @@ !-- == -- target name=tar-src depends=package-src description=-- generate src.tar.gz distribution package tar compression=gzip longfile=gnu - destfile=${src.dist.version.dir}.tar.gz basedir=${src.dist.version.dir} - tarfileset dir=${dist.dir} mode=664 - exclude name=${src.dist.version.dir}/bin/* / - exclude name=${src.dist.version.dir}/runtime/* / -include name=${src.dist.version.dir}/** / + destfile=${src.dist.version.dir}.tar.gz + tarfileset dir=${src.dist.version.dir} mode=664 prefix=${final.name} +exclude name=src/bin/* / +include name=** / /tarfileset - tarfileset dir=${dist.dir} mode=755 -include name=${src.dist.version.dir}/bin/* / + tarfileset dir=${src.dist.version.dir} mode=755 prefix=${final.name} +include name=src/bin/* / /tarfileset /tar /target @@ -715,13 +714,13 @@ !-- == -- target name=tar-bin depends=package-bin description=-- generate bin.tar.gz distribution package tar compression=gzip longfile=gnu - 
destfile=${bin.dist.version.dir}.tar.gz basedir=${bin.dist.version.dir} - tarfileset dir=${dist.dir} mode=664 - exclude name=${bin.dist.version.dir}/bin/* / -include name=${bin.dist.version.dir}/** / + destfile=${bin.dist.version.dir}.tar.gz + tarfileset dir=${bin.dist.version.dir} mode=664 prefix=${final.name} +exclude name=bin/* / +include name=** / /tarfileset - tarfileset dir=${dist.dir} mode=755 -include name=${bin.dist.version.dir}/bin/* / + tarfileset dir=${bin.dist.version.dir} mode=755 prefix=${final.name} +include name=bin/* / /tarfileset /tar /target @@ -731,14 +730,13 @@ !-- == -- target name=zip-src depends=package-src description=-- generate src.zip distribution package zip compress=true casesensitive=yes - destfile=${src.dist.version.dir}.zip basedir=${src.dist.version.dir} - zipfileset dir=${dist.dir} filemode=664 - exclude name=${src.dist.version.dir}/bin/* / - exclude name=${src.dist.version.dir}/runtime/* / - include name=${src.dist.version.dir}/** / + destfile=${src.dist.version.dir}.zip + zipfileset dir=${src.dist.version.dir} filemode=664 prefix=${final.name} + exclude name=src/bin/* / + include name=** / /zipfileset - zipfileset dir=${dist.dir} filemode=755 - include name=${src.dist.version.dir}/bin/* / + zipfileset dir=${src.dist.version.dir} filemode=755 prefix=${final.name} + include name=src/bin/* / /zipfileset /zip /target @@ -746,15 +744,15 @@ !-- == -- !-- Make bin release zip -- !-- == -- - target name=zip-bin depends=package-bin description=-- generate src.zip distribution package + target name=zip-bin depends=package-bin description=-- generate bin.zip distribution package zip compress=true casesensitive=yes - destfile=${bin.dist.version.dir}.zip basedir=${bin.dist.version.dir} - zipfileset dir=${dist.dir} filemode=664 - exclude name=${bin.dist.version.dir}/bin/* / - include name=${bin.dist.version.dir}/** / + destfile=${bin.dist.version.dir}.zip + zipfileset dir=${bin.dist.version.dir} filemode=664 prefix=${final.name} + exclude 
name=bin/* / + include name=** / /zipfileset - zipfileset dir=${dist.dir} filemode=755 - include name
svn commit: r1396796 - in /nutch/trunk: CHANGES.txt conf/regex-normalize.xml.template src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test src/plugin/urlnormalizer-regex/sample/regex-nor
Author: snagel Date: Wed Oct 10 21:06:27 2012 New Revision: 1396796 URL: http://svn.apache.org/viewvc?rev=1396796view=rev Log: NUTCH-706 Url regex normalizer: pattern for session id removal not to match newsId Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/regex-normalize.xml.template nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1396796r1=1396795r2=1396796view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Oct 10 21:06:27 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-706 Url regex normalizer: pattern for session id removal not to match newsId (Meghna Kukreja via snagel) + * NUTCH-1415 release packages to contain top level folder apache-nutch-x.x (snagel) * NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc) Modified: nutch/trunk/conf/regex-normalize.xml.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396796r1=1396795r2=1396796view=diff == --- nutch/trunk/conf/regex-normalize.xml.template (original) +++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:06:27 2012 @@ -29,7 +29,7 @@ !-- removes session ids from urls (such as jsessionid and PHPSESSID) -- regex - pattern([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern + pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern substitution$4/substitution /regex Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396796r1=1396795r2=1396796view=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original) +++ 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Wed Oct 10 21:06:27 2012 @@ -11,6 +11,8 @@ http://www.foo.com/foo.html;jsessionid=1 http://www.foo.com/foo.html?param=1another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1another=2 http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1another=2 http://www.foo.com/foo.html?param=1another=2 http://www.foo.com/foo.php?x=1sid=xyzsomething=1 http://www.foo.com/foo.php?x=1something=1 +# but NewsId is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 # test removal default pages http://www.foo.com/home/index.html http://www.foo.com/home/ Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396796r1=1396795r2=1396796view=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Wed Oct 10 21:06:27 2012 @@ -13,7 +13,7 @@ !-- removes session ids from urls (such as jsessionid and PHPSESSID) -- regex - pattern([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern + pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern substitution$4/substitution /regex
svn commit: r1396817 - in /nutch/trunk: conf/regex-normalize.xml.template src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test src/plugin/urlnormalizer-regex/sample/regex-normalize-defau
Author: snagel Date: Wed Oct 10 21:54:37 2012 New Revision: 1396817 URL: http://svn.apache.org/viewvc?rev=1396817view=rev Log: NUTCH-706 (applied correct patch) Modified: nutch/trunk/conf/regex-normalize.xml.template nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Modified: nutch/trunk/conf/regex-normalize.xml.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396817r1=1396816r2=1396817view=diff == --- nutch/trunk/conf/regex-normalize.xml.template (original) +++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:54:37 2012 @@ -29,7 +29,7 @@ !-- removes session ids from urls (such as jsessionid and PHPSESSID) -- regex - pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern + pattern(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern substitution$4/substitution /regex Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396817r1=1396816r2=1396817view=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Wed Oct 10 21:54:37 2012 @@ -11,8 +11,13 @@ http://www.foo.com/foo.html;jsessionid=1 http://www.foo.com/foo.html?param=1another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1another=2 http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1another=2 http://www.foo.com/foo.html?param=1another=2 http://www.foo.com/foo.php?x=1sid=xyzsomething=1 http://www.foo.com/foo.php?x=1something=1 -# but NewsId is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 
http://www.foo.com/foo.html +http://www.foo.com/foo.php?_sessionid=qmyrcedtoutputformat=htmlpath=/3_images/foo http://www.foo.com/foo.php?outputformat=htmlpath=/3_images/foo +http://www.foo.com/foo.php?_pid=2_spid=0lang=en_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2_spid=0lang=en +http://www.foo.com/foo.php?app=contentcontent=overviewlang=en_sid=587fba8f825b05844526519fdb7d75c8b=35m=47 http://www.foo.com/foo.php?app=contentcontent=overviewlang=enb=35m=47 +# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328) http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 +http://www.foo.com/home.cfm?language=encountry=ukaddressid=250646pagingpos=0 http://www.foo.com/home.cfm?language=encountry=ukaddressid=250646pagingpos=0 # test removal default pages http://www.foo.com/home/index.html http://www.foo.com/home/ Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396817r1=1396816r2=1396817view=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Wed Oct 10 21:54:37 2012 @@ -13,7 +13,7 @@ !-- removes session ids from urls (such as jsessionid and PHPSESSID) -- regex - pattern([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern + pattern(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|amp;|#|$)/pattern substitution$4/substitution /regex
svn commit: r1401458 - /nutch/branches/2.x/CHANGES.txt
Author: snagel Date: Tue Oct 23 20:47:16 2012 New Revision: 1401458 URL: http://svn.apache.org/viewvc?rev=1401458&view=rev Log: NUTCH-1344 BasicURLNormalizer to normalize https same as http - forgot to add committer Modified: nutch/branches/2.x/CHANGES.txt Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1401458&r1=1401457&r2=1401458&view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Oct 23 20:47:16 2012 @@ -8,7 +8,7 @@ Release 2.2 - Current Development * NUTCH-874 Make sure all plugins in src/plugin are compatible with Nutch 2.0 and Gora (part 1) (Kiran Chitturi via lewismc) -* NUTCH-1344 BasicURLNormalizer to normalize https same as http +* NUTCH-1344 BasicURLNormalizer to normalize https same as http (snagel) * NUTCH-706 Url regex normalizer: pattern for session id removal not to match newsId (Meghna Kukreja via snagel)
svn commit: r1401459 - in /nutch/trunk: CHANGES.txt src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Author: snagel Date: Tue Oct 23 20:51:35 2012 New Revision: 1401459 URL: http://svn.apache.org/viewvc?rev=1401459view=rev Log: NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1401459r1=1401458r2=1401459view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Oct 23 20:51:35 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel) + * NUTCH-1341 NotModified time set to now but page not modified (markus) * NUTCH-1215 UpdateDB should not require segment as input (markus) Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1401459r1=1401458r2=1401459view=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Tue Oct 23 20:51:35 2012 @@ -247,7 +247,14 @@ public class RegexURLNormalizer extends } if (patternValue != null subValue != null) { Rule rule = new Rule(); - rule.pattern = Pattern.compile(patternValue); + try { +rule.pattern = Pattern.compile(patternValue); + } catch (PatternSyntaxException e) { +if (LOG.isErrorEnabled()) { + LOG.error(skipped rule: + patternValue + - + subValue + : invalid regular expression pattern: + e); +} +continue; + } rule.substitution = subValue; rules.add(rule); }
svn commit: r1461854 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java
Author: snagel Date: Wed Mar 27 21:31:42 2013 New Revision: 1461854 URL: http://svn.apache.org/r1461854 Log: parsechecker and indexchecker to report truncated content Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461854r1=1461853r2=1461854view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Mar 27 21:31:42 2013 @@ -2,6 +2,8 @@ Nutch Change Log (trunk): Current Development +* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel) + * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc) * NUTCH-1047 Pluggable indexing backends (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461854r1=1461853r2=1461854view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Mar 27 21:31:42 2013 @@ -35,6 +35,7 @@ import org.apache.nutch.indexer.NutchDoc import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseSegment; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; @@ -105,6 +106,10 @@ public class IndexingFiltersChecker exte // store the guessed content type in the crawldatum datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType)); +if (ParseSegment.isTruncated(content)) { + LOG.warn(Content is truncated, parse may fail!); +} + if (LOG.isInfoEnabled()) { LOG.info(parsing: + url); LOG.info(contentType: 
+ contentType); Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461854r1=1461853r2=1461854view=diff == --- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 21:31:42 2013 @@ -106,6 +106,10 @@ public class ParserChecker implements To return (-1); } +if (ParseSegment.isTruncated(content)) { + LOG.warn(Content is truncated, parse may fail!); +} + ParseResult parseResult = new ParseUtil(conf).parse(content); // Calculate the signature
svn commit: r1461857 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java
Author: snagel Date: Wed Mar 27 21:33:38 2013 New Revision: 1461857 URL: http://svn.apache.org/r1461857 Log: parsechecker and indexchecker to report truncated content Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461857r1=1461856r2=1461857view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Wed Mar 27 21:33:38 2013 @@ -2,6 +2,8 @@ Nutch Change Log Release 2.2 - Current Development +* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel) + * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel via lewismc) * NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc) Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461857r1=1461856r2=1461857view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Mar 27 21:33:38 2013 @@ -28,6 +28,7 @@ import org.apache.hadoop.util.ToolRunner import org.apache.nutch.crawl.CrawlStatus; import org.apache.nutch.parse.ParseStatusUtils; import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParserJob; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; @@ -109,6 +110,10 @@ public class IndexingFiltersChecker exte LOG.info(contentType: + contentType); } +if (ParserJob.isTruncated(url, page)) { + LOG.warn(Content is truncated, parse may fail!); +} + (new ParseUtil(conf)).process(url, page); if 
(!ParseStatusUtils.isSuccess(page.getParseStatus())) { LOG.warn(Problem with parse - check log); Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461857r1=1461856r2=1461857view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 21:33:38 2013 @@ -121,6 +121,10 @@ public class ParserChecker implements To page.setContentType(new Utf8(contentType)); +if (ParserJob.isTruncated(url, page)) { + LOG.warn(Content is truncated, parse may fail!); +} + Parse parse = new ParseUtil(conf).parse(url, page); if (parse == null) {
svn commit: r1480484 - in /nutch/branches/2.x: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel Date: Wed May 8 22:04:04 2013 New Revision: 1480484 URL: http://svn.apache.org/r1480484 Log: NUTCH-956 solrindex issues Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/schema-solr4.xml nutch/branches/2.x/conf/schema.xml nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484r1=1480483r2=1480484view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Wed May 8 22:04:04 2013 @@ -2,6 +2,8 @@ Nutch Change Log Release 2.2 - Current Development +* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel) + * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp) * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp) Modified: nutch/branches/2.x/conf/schema-solr4.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484r1=1480483r2=1480484view=diff == --- nutch/branches/2.x/conf/schema-solr4.xml (original) +++ nutch/branches/2.x/conf/schema-solr4.xml Wed May 8 22:04:04 2013 @@ -346,6 +346,9 @@ !-- fields for creativecommons plugin -- field name=cc type=string stored=true indexed=true multiValued=true/ + +!-- fields for tld plugin -- +field name=tld type=string stored=false indexed=false/ /fields uniqueKeyid/uniqueKey defaultSearchFieldtext/defaultSearchField Modified: nutch/branches/2.x/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484r1=1480483r2=1480484view=diff == --- nutch/branches/2.x/conf/schema.xml (original) +++ nutch/branches/2.x/conf/schema.xml Wed May 8 22:04:04 2013 @@ -114,6 +114,9 @@ !-- fields for creativecommons plugin -- field name=cc type=string stored=true indexed=true multiValued=true/ + +!-- fields for tld plugin -- +field name=tld type=string stored=false indexed=false/ /fields 
uniqueKeyid/uniqueKey defaultSearchFieldcontent/defaultSearchField Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484r1=1480483r2=1480484view=diff == --- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May 8 22:04:04 2013 @@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory; /** * Add (or reset) a few metaData properties as respective fields (if they are - * available), so that they can be displayed by more.jsp (called by search.jsp). + * available), so that they can be accurately used within the search index. * - * content-type is indexed to support query by type: last-modifed is indexed to - * support query by date: + * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP + * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt + * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative + * that the content provider wants the filename therein to be used as the title. * * Still need to make content-length searchable! 
* @@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen */ private NutchDocument addType(NutchDocument doc, WebPage page, String url) { String mimeType = null; -Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE)); +Utf8 contentType = page.getContentType(); +if (contentType == null) + contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE)); if (contentType == null) { // Note by Jerome Charron on 20050415: // Content Type not solved by a previous plugin @@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen return doc; } -//String scontentType = mimeType.getName(); - doc.add(type, mimeType); // Check if we need to split the content type in sub parts -if ( null != contentType conf.getBoolean(moreIndexingFilter.indexMimeTypeParts, true)) { - String[] parts = getParts(contentType.toString()); +if (conf.getBoolean(moreIndexingFilter.indexMimeTypeParts, true)) { + String[] parts = getParts
svn commit: r1480485 - in /nutch/trunk: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel Date: Wed May 8 22:04:53 2013 New Revision: 1480485 URL: http://svn.apache.org/r1480485 Log: NUTCH-956 solrindex issues Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/schema-solr4.xml nutch/trunk/conf/schema.xml nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1480485r1=1480484r2=1480485view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed May 8 22:04:53 2013 @@ -2,6 +2,8 @@ Nutch Change Log (trunk): Current Development +* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel) + * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp) * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp) Modified: nutch/trunk/conf/schema-solr4.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1480485r1=1480484r2=1480485view=diff == --- nutch/trunk/conf/schema-solr4.xml (original) +++ nutch/trunk/conf/schema-solr4.xml Wed May 8 22:04:53 2013 @@ -345,6 +345,9 @@ !-- fields for creativecommons plugin -- field name=cc type=string stored=true indexed=true multiValued=true/ + +!-- fields for tld plugin -- +field name=tld type=string stored=false indexed=false/ /fields uniqueKeyid/uniqueKey defaultSearchFieldtext/defaultSearchField Modified: nutch/trunk/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1480485r1=1480484r2=1480485view=diff == --- nutch/trunk/conf/schema.xml (original) +++ nutch/trunk/conf/schema.xml Wed May 8 22:04:53 2013 @@ -114,6 +114,9 @@ !-- fields for creativecommons plugin -- field name=cc type=string stored=true indexed=true multiValued=true/ + +!-- fields for tld plugin -- +field name=tld type=string stored=false indexed=false/ /fields uniqueKeyid/uniqueKey defaultSearchFieldcontent/defaultSearchField Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480485r1=1480484r2=1480485view=diff == --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May 8 22:04:53 2013 @@ -52,12 +52,13 @@ import org.apache.commons.lang.StringUti import org.apache.commons.lang.time.DateUtils; /** - * Add (or reset) a few metaData properties as respective fields - * (if they are available), so that they can be displayed by more.jsp - * (called by search.jsp). - * - * content-type is indexed to support query by type: - * last-modifed is indexed to support query by date: + * Add (or reset) a few metaData properties as respective fields (if they are + * available), so that they can be accurately used within the search index. + * + * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP + * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt + * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative + * that the content provider wants the filename therein to be used as the title. * * Still need to make content-length searchable! *
svn commit: r1494776 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
Author: snagel Date: Wed Jun 19 21:26:07 2013 New Revision: 1494776 URL: http://svn.apache.org/r1494776 Log: NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb and is generated over and over again Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1494776r1=1494775r2=1494776view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jun 19 21:26:07 2013 @@ -2,6 +2,8 @@ Nutch Change Log (trunk): Current Development +* NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb (snagel) + * NUTCH-1527 Elasticsearch indexer (lufeng + markus) * NUTCH-1475 Index-More Plugin -- A better fall back value for date field (James Sullivan, snagel via lewismc) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1494776r1=1494775r2=1494776view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Wed Jun 19 21:26:07 2013 @@ -85,9 +85,8 @@ public abstract class AbstractFetchSched /** * This method specifies how to schedule refetching of pages - * marked as GONE. Default implementation increases fetchInterval by 50%, - * and if it exceeds the codemaxInterval/code it calls - * {@link #forceRefetch(Text, CrawlDatum, boolean)}. + * marked as GONE. Default implementation increases fetchInterval by 50% + * but the value may never exceed codemaxInterval/code. * * @param url URL of the page. * @@ -102,9 +101,11 @@ public abstract class AbstractFetchSched long prevFetchTime, long prevModifiedTime, long fetchTime) { // no page is truly GONE ... just increase the interval by 50% // and try much later. 
-datum.setFetchInterval(datum.getFetchInterval() * 1.5f); +if ((datum.getFetchInterval() * 1.5f) < maxInterval) + datum.setFetchInterval(datum.getFetchInterval() * 1.5f); +else + datum.setFetchInterval(maxInterval * 0.9f); datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000); -if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false); return datum; }
svn commit: r1494785 - /nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel Date: Wed Jun 19 22:22:00 2013 New Revision: 1494785 URL: http://svn.apache.org/r1494785 Log: NUTCH-1475 (fix after fix) fill field date with fetch time (as before) if modified time is unset Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1494785&r1=1494784&r2=1494785&view=diff == --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Jun 19 22:22:00 2013 @@ -105,7 +105,7 @@ public class MoreIndexingFilter implemen if (time == -1) { // if no last-modified specified in HTTP header time = datum.getModifiedTime(); // use value in CrawlDatum if (time <= 0) {// if also unset -time = new Date().getTime(); // use current time +time = datum.getFetchTime(); // use time the fetch took place (fetchTime of fetchDatum) } }
svn commit: r1497557 - in /nutch/trunk: ./ conf/ src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/ src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/
Author: snagel Date: Thu Jun 27 20:16:22 2013 New Revision: 1497557 URL: http://svn.apache.org/r1497557 Log: NUTCH-1580 index-static returns object instead of value for index.static Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1497557r1=1497556r2=1497557view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jun 27 20:16:22 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) + * NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) Apache Nutch 1.7 Release - 06/20/2013 (mm/dd/) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1497557r1=1497556r2=1497557view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Thu Jun 27 20:16:22 2013 @@ -1241,9 +1241,11 @@ nameindex.static/name value/value description - A simple plugin called at indexing that adds fields with static data. - You can specify a list of fieldname:fieldcontent per nutch job. - It can be useful when collections can't be created by urlpatterns, + Used by plugin index-static to adds fields with static data at indexing time. + You can specify a comma-separated list of fieldname:fieldcontent per Nutch job. + Each fieldcontent can have multiple values separated by space, e.g., +field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ... + It can be useful when collections can't be created by URL patterns, like in subcollection, but on a job-basis. 
/description /property Modified: nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1497557r1=1497556r2=1497557view=diff == --- nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (original) +++ nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Thu Jun 27 20:16:22 2013 @@ -57,7 +57,9 @@ public class StaticFieldIndexer implemen if (this.addStaticFields == true) { for (EntryString, String[] entry : this.fields.entrySet()) { -doc.add(entry.getKey(), entry.getValue()); +for (String val : entry.getValue()) { + doc.add(entry.getKey(), val); +} } } return doc; Modified: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1497557r1=1497556r2=1497557view=diff == --- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (original) +++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Thu Jun 27 20:16:22 2013 @@ -100,11 +100,11 @@ public class TestStaticFieldIndexerTest assertNotNull(doc); assertFalse(test if doc is not empty, doc.getFieldNames().isEmpty()); assertEquals(test if doc has 3 fields, 3, doc.getFieldNames().size()); -assertEquals(test if doc has field1, val1, -((String[]) doc.getField(field1).getValues().get(0))[0]); -assertEquals(test if doc has field2, val2, -((String[]) doc.getField(field2).getValues().get(0))[0]); -assertEquals(test if doc has field4, val4, -((String[]) doc.getField(field4).getValues().get(0))[0]); 
+assertTrue(test if doc has field1, doc.getField(field1).getValues() +.contains(val1)); +assertTrue(test if doc has field2, doc.getField(field2).getValues() +.contains(val2)); +assertTrue(test if doc has field4, doc.getField(field4).getValues() +.contains(val4)); } }
svn commit: r1507130 - in /nutch/trunk: CHANGES.txt conf/log4j.properties
Author: snagel Date: Thu Jul 25 21:14:45 2013 New Revision: 1507130 URL: http://svn.apache.org/r1507130 Log: NUTCH-1587 misspelled property threshold in conf/log4j.properties Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1507130r1=1507129r2=1507130view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 25 21:14:45 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel) + * NUTCH-1604 ProtocolFactory not thread-safe (jnioche) * NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) Modified: nutch/trunk/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1507130r1=1507129r2=1507130view=diff == --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Thu Jul 25 21:14:45 2013 @@ -6,7 +6,7 @@ hadoop.log.file=hadoop.log log4j.rootLogger=INFO,DRFA # Logging Threshold -log4j.threshhold=ALL +log4j.threshold=ALL #special logging requirements for some commandline tools log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
svn commit: r1507131 - in /nutch/branches/2.x: CHANGES.txt conf/log4j.properties
Author: snagel Date: Thu Jul 25 21:15:02 2013 New Revision: 1507131 URL: http://svn.apache.org/r1507131 Log: NUTCH-1587 misspelled property threshold in conf/log4j.properties Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/log4j.properties Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1507131r1=1507130r2=1507131view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Thu Jul 25 21:15:02 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel) + * NUTCH-1604 ProtocolFactory not thread-safe (jnioche) * NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) Modified: nutch/branches/2.x/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1507131r1=1507130r2=1507131view=diff == --- nutch/branches/2.x/conf/log4j.properties (original) +++ nutch/branches/2.x/conf/log4j.properties Thu Jul 25 21:15:02 2013 @@ -21,7 +21,7 @@ hadoop.log.file=hadoop.log log4j.rootLogger=INFO,DRFA # Logging Threshold -log4j.threshhold=ALL +log4j.threshold=ALL #special logging requirements for some commandline tools log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
svn commit: r1511479 - in /nutch/trunk: CHANGES.txt src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Author: snagel Date: Wed Aug 7 20:44:01 2013 New Revision: 1511479 URL: http://svn.apache.org/r1511479 Log: NUTCH-911 protocol-file to return proper protocol status for notmodified, gone, access_denied Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1511479r1=1511478r2=1511479view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Aug 7 20:44:01 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel) + * NUTCH-806 Merge CrawlDBScanner with CrawlDBReader (jnioche) * NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel) Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1511479r1=1511478r2=1511479view=diff == --- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Aug 7 20:44:01 2013 @@ -105,6 +105,15 @@ public class File implements Protocol { if (code == 200) { // got a good response return new ProtocolOutput(response.toContent()); // return it +} else if (code == 304) { // got not modified + return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED); + +} else if (code == 401) { // access denied / no read permissions + return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED)); + +} else if (code == 404) { // no such file + return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND); + } else if (code = 300 code 400) { // handle redirect if (redirects == MAX_REDIRECTS) 
throw new FileException(Too many redirects: + url);
svn commit: r1511496 - in /nutch/branches/2.x: CHANGES.txt src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Author: snagel Date: Wed Aug 7 21:10:17 2013 New Revision: 1511496 URL: http://svn.apache.org/r1511496 Log: NUTCH-911 protocol-file to return proper protocol status for notmodified, gone, access_denied Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1511496r1=1511495r2=1511496view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Wed Aug 7 21:10:17 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel) + * NUTCH-1587 misspelled property threshold in conf/log4j.properties (snagel) * NUTCH-1604 ProtocolFactory not thread-safe (jnioche) Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1511496r1=1511495r2=1511496view=diff == --- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Aug 7 21:10:17 2013 @@ -114,6 +114,16 @@ public class File implements Protocol { if (code == 200) { // got a good response return new ProtocolOutput(response.toContent()); // return it + +} else if (code == 304) { // got not modified + return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTMODIFIED); + +} else if (code == 401) { // access denied / no read permissions + return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED)); + +} else if (code == 404) { // no such file + return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTFOUND); + 
} else if (code = 300 code 400) { // handle redirect if (redirects == MAX_REDIRECTS) throw new FileException(Too many redirects: + url);
svn commit: r1544341 - /nutch/branches/2.x/src/test/log4j.properties
Author: snagel Date: Thu Nov 21 22:04:13 2013 New Revision: 1544341 URL: http://svn.apache.org/r1544341 Log: NUTCH-1587 misspelled property threshold in log4j.properties Modified: nutch/branches/2.x/src/test/log4j.properties Modified: nutch/branches/2.x/src/test/log4j.properties URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/log4j.properties?rev=1544341r1=1544340r2=1544341view=diff == --- nutch/branches/2.x/src/test/log4j.properties (original) +++ nutch/branches/2.x/src/test/log4j.properties Thu Nov 21 22:04:13 2013 @@ -1,7 +1,7 @@ # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout -log4j.threshhold=INFO +log4j.threshold=INFO log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
svn commit: r1544340 - /nutch/trunk/src/test/log4j.properties
Author: snagel Date: Thu Nov 21 22:03:18 2013 New Revision: 1544340 URL: http://svn.apache.org/r1544340 Log: NUTCH-1587 misspelled property threshold in log4j.properties Modified: nutch/trunk/src/test/log4j.properties Modified: nutch/trunk/src/test/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/log4j.properties?rev=1544340r1=1544339r2=1544340view=diff == --- nutch/trunk/src/test/log4j.properties (original) +++ nutch/trunk/src/test/log4j.properties Thu Nov 21 22:03:18 2013 @@ -1,7 +1,7 @@ # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout -log4j.threshhold=ALL +log4j.threshold=ALL log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
svn commit: r1560512 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: snagel Date: Wed Jan 22 21:13:01 2014 New Revision: 1560512 URL: http://svn.apache.org/r1560512 Log: NUTCH-1413 Record response time Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560512r1=1560511r2=1560512view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 22 21:13:01 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel) + * NUTCH-1325 HostDB for Nutch (markus, tejasp) * NUTCH-1680 CrawlDbReader to dump minRetry value (markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1560512r1=1560511r2=1560512view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed Jan 22 21:13:01 2014 @@ -266,6 +266,16 @@ /description /property +property + namehttp.store.responsetime/name + valuetrue/value + descriptionEnables us to record the response time of the + host which is the time period between start connection to end + connection of a pages host. 
The response time in milliseconds + is stored in CrawlDb in CrawlDatum's meta data under key quot;_rs_quot; + /description +/property + !-- FTP properties -- property Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1560512r1=1560511r2=1560512view=diff == --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Jan 22 21:13:01 2014 @@ -37,6 +37,7 @@ import org.apache.nutch.util.DeflateUtil // Hadoop imports import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; // crawler-commons imports @@ -47,7 +48,8 @@ import crawlercommons.robots.BaseRobotRu */ public abstract class HttpBase implements Protocol { - + public static final Text RESPONSE_TIME = new Text(_rs_); + public static final int BUFFER_SIZE = 8 * 1024; private static final byte[] EMPTY_CONTENT = new byte[0]; @@ -92,6 +94,12 @@ public abstract class HttpBase implement /** Do we use HTTP/1.1? */ protected boolean useHttp11 = false; + + /** + * Record response time in CrawlDatum's meta data, see property + * http.store.responsetime. + */ + protected boolean responseTime = true; /** Skip page if Crawl-Delay longer than this value. 
*/ protected long maxCrawlDelay = -1L; @@ -123,6 +131,7 @@ public abstract class HttpBase implement this.accept = conf.get(http.accept, accept); // backward-compatible default setting this.useHttp11 = conf.getBoolean(http.useHttp11, false); + this.responseTime = conf.getBoolean(http.store.responsetime, true); this.robots.setConf(conf); logConf(); } @@ -137,8 +146,15 @@ public abstract class HttpBase implement String urlString = url.toString(); try { URL u = new URL(urlString); + + long startTime = System.currentTimeMillis(); Response response = getResponse(u, datum, false); // make a request + if(this.responseTime) { +int elapsedTime = (int) (System.currentTimeMillis() - startTime); +datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime)); + } + int code = response.getCode(); byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(),
svn commit: r1575350 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/NutchWritable.java
Author: snagel Date: Fri Mar 7 18:13:20 2014 New Revision: 1575350 URL: http://svn.apache.org/r1575350 Log: removed HostDB from Nutch 1.8 trunk: fix build, remove HostDb related entries from change log Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575350r1=1575349r2=1575350view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 7 18:13:20 2014 @@ -12,16 +12,12 @@ Nutch Development Trunk * NUTCH-1253 Incompatable neko and xerces versions (snagel, lewismc) -* NUTCH-1717 HostDB not to complain if filters/normalizers are disabled (markus) - * NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp) * NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, Doğacan Güney via markus) * NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel) -* NUTCH-1325 HostDB for Nutch (markus, tejasp) - * NUTCH-1680 CrawlDbReader to dump minRetry value (markus) * NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1575350r1=1575349r2=1575350view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Fri Mar 7 18:13:20 2014 @@ -47,8 +47,7 @@ public class NutchWritable extends Gener org.apache.nutch.parse.ParseStatus.class, org.apache.nutch.protocol.Content.class, org.apache.nutch.protocol.ProtocolStatus.class, - org.apache.nutch.scoring.webgraph.LinkDatum.class, - org.apache.nutch.util.hostdb.HostDatum.class, + org.apache.nutch.scoring.webgraph.LinkDatum.class }; }
svn commit: r1575351 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: snagel Date: Fri Mar 7 18:15:50 2014 New Revision: 1575351 URL: http://svn.apache.org/r1575351 Log: NUTCH-1706 IndexerMapReduce does not remove db_redir_temp Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575351r1=1575350r2=1575351view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 7 18:15:50 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1706 IndexerMapReduce does not remove db_redir_temp (markus, snagel) + * NUTCH-1113 SegmentMerger can now be safely used to merge segments (Edward Drapkin, markus, snagel) * NUTCH-1729 Upgrade to Tika 1.5 (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1575351r1=1575350r2=1575351view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Mar 7 18:15:50 2014 @@ -180,36 +180,10 @@ implements MapperText, Writable, Text, dbDatum = datum; } else if (CrawlDatum.hasFetchStatus(datum)) { - // don't index unmodified (empty) pages if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { fetchDatum = datum; - -/** - * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT. 
- */ -if (delete) { - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { -reporter.incrCounter(IndexerStatus, Documents deleted, 1); - -NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); -output.collect(key, action); -return; - } - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || - fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || - dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || - dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { -reporter.incrCounter(IndexerStatus, Deleted redirects, 1); -reporter.incrCounter(IndexerStatus, Perm redirects deleted, 1); - -NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); -output.collect(key, action); -return; - } -} } - } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || CrawlDatum.STATUS_SIGNATURE == datum.getStatus() || CrawlDatum.STATUS_PARSE_META == datum.getStatus()) { @@ -239,6 +213,29 @@ implements MapperText, Writable, Text, LOG.warn(Unrecognized type: +value.getClass()); } } + +// Whether to delete GONE or REDIRECTS +if (delete fetchDatum != null dbDatum != null) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { +reporter.incrCounter(IndexerStatus, Documents deleted, 1); + +NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); +output.collect(key, action); +return; + } + + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || + fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { +reporter.incrCounter(IndexerStatus, Deleted redirects, 1); +reporter.incrCounter(IndexerStatus, Perm redirects deleted, 1); + +NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); +output.collect(key, action); 
+return; + } +} if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
svn commit: r1578620 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: snagel Date: Mon Mar 17 21:56:32 2014 New Revision: 1578620 URL: http://svn.apache.org/r1578620 Log: NUTCH-1671 indexchecker to add digest field Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1578620r1=1578619r2=1578620view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Mar 17 21:56:32 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1671 indexchecker to add digest field (snagel, lufeng) + * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel) * NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series (kiran, Nguyen Manh Tien, Talat UYARER, Vangelis Karvounis via lewismc) Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1578620r1=1578619r2=1578620view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Mon Mar 17 21:56:32 2014 @@ -37,6 +37,7 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.protocol.ProtocolStatusUtils; import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -122,6 +123,7 @@ public class IndexingFiltersChecker exte } NutchDocument doc = new NutchDocument(); +doc.add(digest, StringUtil.toHexString(page.getSignature())); try { doc = indexers.filter(doc, url, page);
svn commit: r1580046 - in /nutch/trunk: CHANGES.txt src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser
Author: snagel Date: Fri Mar 21 20:56:13 2014 New Revision: 1580046 URL: http://svn.apache.org/r1580046 Log: NUTCH-1733 parse-html to support HTML5 charset definitions Added: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (with props) Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1580046r1=1580045r2=1580046view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 21 20:56:13 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel) + * NUTCH-1671 indexchecker to add digest field (snagel, lufeng) Nutch 1.8 - 11/03/2014 (dd/mm/) Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1580046r1=1580045r2=1580046view=diff == --- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Fri Mar 21 20:56:13 2014 @@ -56,6 +56,9 @@ public class HtmlParser implements Parse private static Pattern charsetPattern = Pattern.compile(charset=\\s*([a-z][_\\-0-9a-z]*), Pattern.CASE_INSENSITIVE); + private static Pattern charsetPatternHTML5 = + Pattern.compile(meta\\s+charset\\s*=\\s*[\']?([a-z][_\\-0-9a-z]*)[^]*, + Pattern.CASE_INSENSITIVE); private String parserImpl; @@ -64,13 +67,13 @@ public class HtmlParser implements Parse * emunknown/em encoding, read out 'charset' parameter in the meta tag * from the first codeCHUNK_SIZE/code bytes. * If there's no meta tag for Content-Type or no charset is specified, + * the content is checked for a Unicode Byte Order Mark (BOM). 
+ * This will also cover non-byte oriented character encodings (UTF-16 only). + * If no character set can be determined, * codenull/code is returned. br / - * FIXME: non-byte oriented character encodings (UTF-16, UTF-32) - * can't be handled with this. - * We need to do something similar to what's done by mozilla - * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993). - * See also http://www.w3.org/TR/REC-xml/#sec-guessing - * br / + * See also http://www.w3.org/International/questions/qa-html-encoding-declarations, + * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and + * http://www.w3.org/TR/REC-xml/#sec-guessing * * @param content codebyte[]/code representation of an html file */ @@ -99,6 +102,30 @@ public class HtmlParser implements Parse if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } +if (encoding == null) { + // check for HTML5 meta charset + metaMatcher = charsetPatternHTML5.matcher(str); + if (metaMatcher.find()) { +encoding = new String(metaMatcher.group(1)); + } +} +if (encoding == null) { + // check for BOM + if (content.length = 3 + content[0] == (byte) 0xEF + content[1] == (byte) 0xBB + content[2] == (byte) 0xBF) { +encoding = UTF-8; + } else if (content.length = 2) { +if (content[0] == (byte)0xFF + content[1] == (byte)0xFE) { + encoding = UTF-16LE; +} else if (content[0] == (byte)0xFE + content[1] == (byte)0xFF) { + encoding = UTF-16BE; +} + } +} return encoding; } Added: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1580046view=auto == --- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (added) +++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Fri Mar 21 20:56:13 2014 @@ -0,0 +1,137 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy
svn commit: r1580270 - in /nutch/site: forrest/src/documentation/content/xdocs/downloads.xml publish/downloads.html
Author: snagel Date: Sat Mar 22 18:04:10 2014 New Revision: 1580270 URL: http://svn.apache.org/r1580270 Log: NUTCH-1742 update remaining references of 1.7 - 1.8 Modified: nutch/site/forrest/src/documentation/content/xdocs/downloads.xml nutch/site/publish/downloads.html Modified: nutch/site/forrest/src/documentation/content/xdocs/downloads.xml URL: http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/downloads.xml?rev=1580270r1=1580269r2=1580270view=diff == --- nutch/site/forrest/src/documentation/content/xdocs/downloads.xml (original) +++ nutch/site/forrest/src/documentation/content/xdocs/downloads.xml Sat Mar 22 18:04:10 2014 @@ -30,10 +30,10 @@ section titleDownload/title - p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.7 (src-tar, src-zip, bin-tar and bin-zip) are now available. See + p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.8 (src-tar, src-zip, bin-tar and bin-zip) are now available. See the a href=http://apache.org/dist/nutch/2.2.1/CHANGES-2.2.1.txt;CHANGES-2.2.1.txt/a, and - a href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.7.txt/a + a href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.8.txt/a files for more information on the list of updates in these releases. /p p All Apache Nutch distributions is distributed under the a href=http://www.apache.org/licenses/LICENSE-2.0.html;Apache License, version 2.0/a. 
@@ -61,7 +61,7 @@ apache-nutch-1.8-src.zip.md5/a/tdtda href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-src.zip.asc; apache-nutch-1.8-src.zip.asc/a /td/tr trtdApache Nutch 1.8 (bin.tar.gz)/tdtda href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.tar.gz; - apache-nutch-1.8-bin.tar.gz/a/td tda href=http://apache.org/dist/nutch/1.7/apache-nutch-1.8-bin.tar.gz.md5; + apache-nutch-1.8-bin.tar.gz/a/td tda href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.md5; apache-nutch-1.8-bin.tar.gz.md5/a /td tda href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.asc; apache-nutch-1.8-bin.tar.gz.asc/a /td/tr trtdApache Nutch 1.8 (bin.zip)/tdtda href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.zip; Modified: nutch/site/publish/downloads.html URL: http://svn.apache.org/viewvc/nutch/site/publish/downloads.html?rev=1580270r1=1580269r2=1580270view=diff == --- nutch/site/publish/downloads.html (original) +++ nutch/site/publish/downloads.html Sat Mar 22 18:04:10 2014 @@ -272,10 +272,10 @@ document.write(Last Published: + docu a name=N1000E/aa name=Download/a h2 class=h3Download/h2 div class=section -p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.7 (src-tar, src-zip, bin-tar and bin-zip) are now available. See +p Apache Nutch 2.2.1 (src-tar and src-zip only) and 1.8 (src-tar, src-zip, bin-tar and bin-zip) are now available. See the a href=http://apache.org/dist/nutch/2.2.1/CHANGES-2.2.1.txt;CHANGES-2.2.1.txt/a, and - a href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.7.txt/a + a href=http://apache.org/dist/nutch/1.8/CHANGES.txt;CHANGES-1.8.txt/a files for more information on the list of updates in these releases. /p p All Apache Nutch distributions is distributed under the a href=http://www.apache.org/licenses/LICENSE-2.0.html;Apache License, version 2.0/a. 
@@ -320,7 +320,7 @@ document.write(Last Published: + docu tr td colspan=1 rowspan=1Apache Nutch 1.8 (bin.tar.gz)/tdtd colspan=1 rowspan=1a href=http://www.apache.org/dyn/closer.cgi/nutch/1.8/apache-nutch-1.8-bin.tar.gz; - apache-nutch-1.8-bin.tar.gz/a/td td colspan=1 rowspan=1a href=http://apache.org/dist/nutch/1.7/apache-nutch-1.8-bin.tar.gz.md5; + apache-nutch-1.8-bin.tar.gz/a/td td colspan=1 rowspan=1a href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.md5; apache-nutch-1.8-bin.tar.gz.md5/a /td td colspan=1 rowspan=1a href=http://apache.org/dist/nutch/1.8/apache-nutch-1.8-bin.tar.gz.asc; apache-nutch-1.8-bin.tar.gz.asc/a /td /tr
svn commit: r4777 - /release/nutch/1.7/
Author: snagel Date: Sat Mar 22 18:13:52 2014 New Revision: 4777 Log: NUTCH-1742 removed 1.7 packages from svn (svnpubsub) Removed: release/nutch/1.7/
svn commit: r1583193 - in /nutch/trunk: CHANGES.txt src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
Author: snagel Date: Sun Mar 30 19:58:59 2014 New Revision: 1583193 URL: http://svn.apache.org/r1583193 Log: NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class Added: nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java (with props) Modified: nutch/trunk/CHANGES.txt Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1583193r1=1583192r2=1583193view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sun Mar 30 19:58:59 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel) + * NUTCH-1737 Upgrade to recent JUnit 4.x (lewismc) * NUTCH-1733 parse-html to support HTML5 charset definitions (snagel) Added: nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java?rev=1583193view=auto == --- nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java (added) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java Sun Mar 30 19:58:59 2014 @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.crawl; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for AdaptiveFetchSchedule. + * + */ +public class TestAdaptiveFetchSchedule extends TestCase { + + private float inc_rate; + private float dec_rate; + private Configuration conf; + private long curTime, lastModified; + private int changed, interval, calculateInterval; + + @Before + public void setUp() throws Exception { +super.setUp(); +conf = NutchConfiguration.create(); +inc_rate = conf.getFloat(db.fetch.schedule.adaptive.inc_rate, 0.2f); +dec_rate = conf.getFloat(db.fetch.schedule.adaptive.dec_rate, 0.2f); +interval = 100; +lastModified = 0; + } + + /** + * Test the core functionality of AdaptiveFetchSchedule. + * + */ + + @Test + public void testAdaptiveFetchSchedule() { + +FetchSchedule fs = new AdaptiveFetchSchedule(); +fs.setConf(conf); + +CrawlDatum p = prepareCrawlDatum(); +Text url = new Text(http://www.example.com;); + +changed = FetchSchedule.STATUS_UNKNOWN; +fs.setFetchSchedule(url, p, p.getFetchTime(), +p.getModifiedTime(), curTime, lastModified, changed); +validateFetchInterval(changed, p.getFetchInterval()); + +changed = FetchSchedule.STATUS_MODIFIED; +fs.setFetchSchedule(url, p, p.getFetchTime(), +p.getModifiedTime(), curTime, lastModified, changed); +validateFetchInterval(changed, p.getFetchInterval()); +p.setFetchInterval(interval); + +changed = FetchSchedule.STATUS_NOTMODIFIED; +fs.setFetchSchedule(url, p, p.getFetchTime(), +p.getModifiedTime(), curTime, lastModified, changed); +validateFetchInterval(changed, p.getFetchInterval()); + + } + + /** + * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to Test AdaptiveFetchSchedule. 
+ * + * @return properly initialized CrawlDatum + */ + public CrawlDatum prepareCrawlDatum() { +CrawlDatum p = new CrawlDatum(); +p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); +p.setFetchInterval(interval); +p.setScore(1.0f); +p.setFetchTime(0); +return p; + } + + /** + * + * The Method validates interval values according to changed parameter. + * + * @param changed + * status value to check calculated interval value. + * @param getInterval + * to test IntervalValue from CrawlDatum which is calculated via + * AdaptiveFetchSchedule algorithm. + */ + private void validateFetchInterval(int changed, int getInterval) { + +if (changed == FetchSchedule.STATUS_UNKNOWN) { + assertEquals
svn commit: r1585144 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: snagel Date: Sat Apr 5 17:06:04 2014 New Revision: 1585144 URL: http://svn.apache.org/r1585144 Log: NUTCH-1735 code dedup fetcher queue redirects Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1585144r1=1585143r2=1585144view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Apr 5 17:06:04 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1735 code dedup fetcher queue redirects (snagel) + * NUTCH-1745 Upgrade to ElasticSearch 1.1.0 (jnioche) * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1585144r1=1585143r2=1585144view=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Apr 5 17:06:04 2014 @@ -731,25 +731,7 @@ public class Fetcher extends Configured refreshTime Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { -CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, -fit.datum.getFetchInterval(), fit.datum.getScore()); -// transfer existing metadata to the redir -newDatum.getMetaData().putAll(fit.datum.getMetaData()); -scfilters.initialScore(redirUrl, newDatum); -if (reprUrl != null) { - newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, - new Text(reprUrl)); -} -fit = FetchItem.create(redirUrl, newDatum, queueMode); -if (fit != null) { - FetchItemQueue fiq = -fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); -} else { - // stop redirecting - redirecting = false; - reporter.incrCounter(FetcherStatus, FetchItem.notCreated.redirect, 1); -} +queueRedirect(redirUrl, fit); } } 
break; @@ -772,25 +754,7 @@ public class Fetcher extends Configured urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); if (redirUrl != null) { - CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, - fit.datum.getFetchInterval(), fit.datum.getScore()); - // transfer existing metadata - newDatum.getMetaData().putAll(fit.datum.getMetaData()); - scfilters.initialScore(redirUrl, newDatum); - if (reprUrl != null) { -newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, -new Text(reprUrl)); - } - fit = FetchItem.create(redirUrl, newDatum, queueMode); - if (fit != null) { -FetchItemQueue fiq = - fetchQueues.getFetchItemQueue(fit.queueID); -fiq.addInProgressFetchItem(fit); - } else { -// stop redirecting -redirecting = false; -reporter.incrCounter(FetcherStatus, FetchItem.notCreated.redirect, 1); - } + queueRedirect(redirUrl, fit); } else { // stop redirecting redirecting = false; @@ -918,6 +882,28 @@ public class Fetcher extends Configured } } +private void queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { + CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, + fit.datum.getFetchInterval(), fit.datum.getScore()); + // transfer all existing metadata to the redirect + newDatum.getMetaData().putAll(fit.datum.getMetaData()); + scfilters.initialScore(redirUrl, newDatum); + if (reprUrl != null) { +newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, +new Text(reprUrl)); + } + fit = FetchItem.create(redirUrl, newDatum, queueMode); + if (fit != null) { +FetchItemQueue fiq = + fetchQueues.getFetchItemQueue(fit.queueID); +fiq.addInProgressFetchItem
svn commit: r1590315 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: snagel Date: Sat Apr 26 22:12:46 2014 New Revision: 1590315 URL: http://svn.apache.org/r1590315 Log: NUTCH-1764 readdb to show command-line help if no action (-stats, -dump, etc.) given Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1590315r1=1590314r2=1590315view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Apr 26 22:12:46 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1764 readdb to show command-line help if no action (-stats, -dump, etc.) given (Diaa via snagel) + * NUTCH-1700 Remove deprecated code from creativecommons plugin (lewismc) * NUTCH-1761 Crawl script fails to find job file if not started from inside bin dir (David Hosking, jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1590315r1=1590314r2=1590315view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Sat Apr 26 22:12:46 2014 @@ -542,7 +542,7 @@ public class CrawlDbReader implements Cl public static void main(String[] args) throws IOException { CrawlDbReader dbr = new CrawlDbReader(); -if (args.length 1) { +if (args.length 2) { System.err.println(Usage: CrawlDbReader crawldb (-stats | -dump out_dir | -topN out_dir [min] | -url url)); System.err.println(\tcrawldb\tdirectory name where crawldb is located); System.err.println(\t-stats [-sort] \tprint overall statistics to System.out);
svn commit: r1592414 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherReducer.java
Author: snagel Date: Sun May 4 20:18:50 2014 New Revision: 1592414 URL: http://svn.apache.org/r1592414 Log: NUTCH-1182 fetcher to log hung threads Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1592414r1=1592413r2=1592414view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sun May 4 20:18:50 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1182 fetcher to log hung threads (snagel) + * NUTCH-1618 Turn speculative execution off for Fetching (talat) * NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in HTMLParser (talat) Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1592414r1=1592413r2=1592414view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Sun May 4 20:18:50 2014 @@ -871,7 +871,24 @@ extends GoraReducerIntWritable, FetchEn // some requests seem to hang, despite all intentions if ((System.currentTimeMillis() - lastRequestStart.get()) timeout) { -LOG.warn(Aborting with + activeThreads + hung threads.); +if (LOG.isWarnEnabled() activeThreads.get() 0) { + LOG.warn(Aborting with + activeThreads + hung threads.); + for (int i = 0; i fetcherThreads.size(); i++) { +FetcherThread thread = fetcherThreads.get(i); +if (thread.isAlive()) { + LOG.warn(Thread # + i + hung while processing + thread.reprUrl); + if (LOG.isDebugEnabled()) { +StackTraceElement[] stack = thread.getStackTrace(); +StringBuilder sb = new StringBuilder(); +sb.append(Stack of thread #).append(i).append(:\n); +for (StackTraceElement s : stack) { + 
sb.append(s.toString()).append('\n'); +} +LOG.debug(sb.toString()); + } +} + } +} return; }
svn commit: r1594071 - in /nutch: branches/2.x/ branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ trunk/ trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/a
Author: snagel Date: Mon May 12 19:39:43 2014 New Revision: 1594071 URL: http://svn.apache.org/r1594071 Log: NUTCH-1752 Cache robots.txt rules per protocol:host:port Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594071r1=1594070r2=1594071view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon May 12 19:39:43 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel) + * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with 2 threads (brian44 via jnioche) * NUTCH-1182 fetcher to log hung threads (snagel) Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071r1=1594070r2=1594071view=diff == --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014 @@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend allowForbidden = conf.getBoolean(http.robots.403.allow, false); } + /** Compose unique key to store and access robot rules in cache for given URL */ + protected static String getCacheKey(URL url) { +String protocol = url.getProtocol().toLowerCase(); // normalize to lower case +String host = url.getHost().toLowerCase(); // normalize to lower case +int port = url.getPort(); +if (port == -1) 
{ + port = url.getDefaultPort(); +} + /* Robot rules apply only to host, protocol, and port where robots.txt is +* hosted (cf. NUTCH-1752). Consequently */ +String cacheKey = protocol + : + host + : + port; +return cacheKey; + } + /** - * The hosts for which the caching of robots rules is yet to be done, - * it sends a Http request to the host corresponding to the {@link URL} - * passed, gets robots file, parses the rules and caches the rules object - * to avoid re-work in future. + * Get the rules from robots.txt which applies for the given {@code url}. + * Robot rules are cached for a unique combination of host, protocol, and + * port. If no rules are found in the cache, a HTTP request is send to fetch + * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the + * rules are cached to avoid re-fetching and re-parsing it again. * - * @param http The {@link Protocol} object - * @param url URL - * - * @return robotRules A {@link BaseRobotRules} object for the rules + * @param http + * The {@link Protocol} object + * @param url + * URL robots.txt applies to + * + * @return {@link BaseRobotRules} holding the rules from robots.txt */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { -String protocol = url.getProtocol().toLowerCase(); // normalize to lower case -String host = url.getHost().toLowerCase(); // normalize to lower case - -BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + : + host); +String cacheKey = getCacheKey(url); +BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey); boolean cacheRule = true; @@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend } if (cacheRule) { -CACHE.put(protocol + : + host, robotRules); // cache rules for host -if (redir != null !redir.getHost().equals(host)) { +CACHE.put(cacheKey, robotRules); // cache rules for host +if (redir != null !redir.getHost().equalsIgnoreCase(url.getHost())) { // cache also for the redirected host - CACHE.put(protocol + : + 
redir.getHost(), robotRules); + CACHE.put(getCacheKey(redir), robotRules); } } } Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1594071r1=1594070r2=1594071view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon May 12 19:39:43 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel
svn commit: r1593595 - in /nutch/site: forrest/src/documentation/content/xdocs/index.xml publish/index.html
Author: snagel Date: Fri May 9 18:48:29 2014 New Revision: 1593595 URL: http://svn.apache.org/r1593595 Log: Nutch 1.8 includes Tika 1.5 Modified: nutch/site/forrest/src/documentation/content/xdocs/index.xml nutch/site/publish/index.html Modified: nutch/site/forrest/src/documentation/content/xdocs/index.xml URL: http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/index.xml?rev=1593595r1=1593594r2=1593595view=diff == --- nutch/site/forrest/src/documentation/content/xdocs/index.xml (original) +++ nutch/site/forrest/src/documentation/content/xdocs/index.xml Fri May 9 18:48:29 2014 @@ -86,7 +86,7 @@ pThe Apache Nutch PMC are pleased to announce the immediate release of Apache Nutch v1.8, we advise all current users and developers of the 1.X series to upgrade to this release. Alhough this release includes library upgrades to a href=http://code.google.com/p/crawler-commons/;Crawler Commons/a 0.3 and - a href=http://tika.apache.org;Apache Tika/a 1.4, it also provides over 30 bug fixes as well as 18 improvements. + a href=http://tika.apache.org;Apache Tika/a 1.5, it also provides over 30 bug fixes as well as 18 improvements. Please see the a href=http://www.apache.org/dist/nutch/1.8/CHANGES.txt;list of changes/a for a full breakdown, or see the a href=http://s.apache.org/oHY;release report/a. As usual in the 1.X series, this release is made available both as source and binary. Additionally developers Modified: nutch/site/publish/index.html URL: http://svn.apache.org/viewvc/nutch/site/publish/index.html?rev=1593595r1=1593594r2=1593595view=diff == --- nutch/site/publish/index.html (original) +++ nutch/site/publish/index.html Fri May 9 18:48:29 2014 @@ -443,7 +443,7 @@ document.write(Last Published: + docu pThe Apache Nutch PMC are pleased to announce the immediate release of Apache Nutch v1.8, we advise all current users and developers of the 1.X series to upgrade to this release. 
Alhough this release includes library upgrades to a href=http://code.google.com/p/crawler-commons/;Crawler Commons/a 0.3 and - a href=http://tika.apache.org;Apache Tika/a 1.4, it also provides over 30 bug fixes as well as 18 improvements. + a href=http://tika.apache.org;Apache Tika/a 1.5, it also provides over 30 bug fixes as well as 18 improvements. Please see the a href=http://www.apache.org/dist/nutch/1.8/CHANGES.txt;list of changes/a for a full breakdown, or see the a href=http://s.apache.org/oHY;release report/a. As usual in the 1.X series, this release is made available both as source and binary. Additionally developers
svn commit: r1604291 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/fetcher/ branches/2.x/src/java/org/apache/nutch/protocol/ trunk/ trunk/conf/ trunk/src/java/or
Author: snagel Date: Fri Jun 20 22:15:43 2014 New Revision: 1604291 URL: http://svn.apache.org/r1604291 Log: NUTCH-1718 redefine http.robots.agent as additional agent names Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/nutch-default.xml nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604291r1=1604290r2=1604291view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:15:43 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1718 redefine http.robots.agent as additional agent names (snagel, Tejas Patil, Daniel Kugel) + * NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc) * NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche) Modified: nutch/branches/2.x/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1604291r1=1604290r2=1604291view=diff == --- nutch/branches/2.x/conf/nutch-default.xml (original) +++ nutch/branches/2.x/conf/nutch-default.xml Fri Jun 20 22:15:43 2014 @@ -90,11 +90,18 @@ property namehttp.robots.agents/name - value*/value - descriptionThe agent strings we'll look for in robots.txt files, - comma-separated, in decreasing order of precedence. You should - put the value of http.agent.name as the first agent name, and keep the - default * at the end of the list. E.g.: BlurflDev,Blurfl,* + value/value + descriptionAny other agents, apart from 'http.agent.name', that the robots + parser would look for in robots.txt. 
Multiple agents can be provided using + comma as a delimiter. eg. mybot,foo-spider,bar-crawler + + The ordering of agents does NOT matter and the robots parser would make + decision based on the agent which matches first to the robots rules. + Also, there is NO need to add a wildcard (ie. *) to this string as the + robots parser would smartly take care of a no-match situation. + + If no value is specified, by default HTTP agent (ie. 'http.agent.name') + would be used for user agent matching by the robots parser. /description /property Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1604291r1=1604290r2=1604291view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Fri Jun 20 22:15:43 2014 @@ -255,10 +255,7 @@ public class FetcherJob extends NutchToo } void checkConfiguration() { - -// ensure that a value has been set for the agent name and that that -// agent name is the first value in the agents we advertise for robot -// rules parsing +// ensure that a value has been set for the agent name String agentName = getConf().get(http.agent.name); if (agentName == null || agentName.trim().length() == 0) { String message = Fetcher: No agents listed in 'http.agent.name' @@ -267,23 +264,6 @@ public class FetcherJob extends NutchToo LOG.error(message); } throw new IllegalArgumentException(message); -} else { - - // get all of the agents that we advertise - String agentNames = getConf().get(http.robots.agents); - StringTokenizer tok = new StringTokenizer(agentNames, ,); - ArrayListString agents = new ArrayListString(); - while (tok.hasMoreTokens()) { -agents.add(tok.nextToken().trim()); - } - - // if the first one is not equal to our agent name, log fatal and throw - // an exception - if 
(!(agents.get(0)).equalsIgnoreCase(agentName)) { -String message = Fetcher: Your 'http.agent.name' value should be -+ listed first in 'http.robots.agents' property.; -LOG.warn(message); - } } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291r1=1604290r2=1604291view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
svn commit: r1604298 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ branches/2.x/src/plugin/parse-html
Author: snagel Date: Fri Jun 20 22:56:32 2014 New Revision: 1604298 URL: http://svn.apache.org/r1604298 Log: NUTCH-1767 remove special treatment of params in relative links Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604298r1=1604297r2=1604298view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:56:32 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1767 remove special treatment of params in relative links (snagel) + * NUTCH-1718 redefine http.robots.agent as additional agent names (snagel, Tejas Patil, Daniel Kugel) * NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc) Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298r1=1604297r2=1604298view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 22:56:32 2014 @@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma public class URLUtil { /** - * Resolve relative URL-s and fix a few java.net.URL errors - * in handling of URLs with embedded params and pure query - * targets. 
+ * Resolve relative URL-s and fix a java.net.URL error + * in handling of URLs with pure query targets. * @param base base url * @param target target url (may be relative) * @return resolved absolute url. Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1604298r1=1604297r2=1604298view=diff == --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Jun 20 22:56:32 2014 @@ -298,51 +298,6 @@ public class DOMContentUtils { } /** - * Handles cases where the url param information is encoded into the base - * url as opposed to the target. - * p - * If the taget contains params (i.e. ';') information then the target - * params information is assumed to be correct and any base params information - * is ignored. If the base contains params information but the tareget does - * not, then the params information is moved to the target allowing it to be - * correctly determined by the java.net.URL class. - * - * @param base The base URL. - * @param target The target path from the base URL. - * - * @return URL A URL with the params information correctly encoded. - * - * @throws MalformedURLException If the url is not a well formed URL. 
- */ - private URL fixEmbeddedParams(URL base, String target) -throws MalformedURLException{ - -// the target contains params information or the base doesn't then no -// conversion necessary, return regular URL -if (target.indexOf(';') = 0 || base.toString().indexOf(';') == -1) { - return new URL(base, target); -} - -// get the base url and it params information -String baseURL = base.toString(); -int startParams = baseURL.indexOf(';'); -String params = baseURL.substring(startParams); - -// if the target has a query string then put the params information after -// any path but before the query string, otherwise just append to the path -int startQS = target.indexOf('?'); -if (startQS = 0) { - target = target.substring(0, startQS) + params + -target.substring(startQS); -} -else { - target += params; -} - -return URLUtil.resolveURL(base, target); - } - - /** * This method finds all anchors below the supplied DOM * codenode/code, and creates appropriate {@link Outlink} * records for each (relative
svn commit: r1605204 [3/3] - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/api/ branches/2.x/src/java/org/apache/nutch/api/impl/ branches/2.x/src/java/org/apache/nutch/crawl/ branche
Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1605204r1=1605203r2=1605204view=diff == --- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Tue Jun 24 21:41:28 2014 @@ -31,7 +31,13 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.conf.Configured; import org.apache.oro.text.regex.*; -/** Converts URLs to a normal form . */ +/** + * Converts URLs to a normal form: + * ul + * liremove dot segments in path: code/.//code or code/..//code/li + * liremove default ports, e.g. 80 for protocol codehttp:///code/li + * /ul + */ public class BasicURLNormalizer extends Configured implements URLNormalizer { public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); Added: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java?rev=1605204view=auto == --- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java Tue Jun 24 21:41:28 2014 @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer performing basic normalizations: remove default ports + * and dot segments in path. + */ +package org.apache.nutch.net.urlnormalizer.basic; Propchange: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java -- svn:eol-style = native Added: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java?rev=1605204view=auto == --- nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java Tue Jun 24 21:41:28 2014 @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer renaming hosts to a canonical form listed in the + * configuration file. + */ +package org.apache.nutch.net.urlnormalizer.host; Propchange: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java -- svn:eol-style = native Added:
svn commit: r1607929 - /nutch/trunk/build.xml
Author: snagel Date: Fri Jul 4 20:15:12 2014 New Revision: 1607929 URL: http://svn.apache.org/r1607929 Log: add dependency init (calling ivy-init) to compile-core-test to fix nightly build failures introduced with NUTCH-1803 Modified: nutch/trunk/build.xml Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1607929&r1=1607928&r2=1607929&view=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Fri Jul 4 20:15:12 2014 @@ -355,7 +355,7 @@ <!-- == --> <!-- Compile test code --> <!-- == --> - <target name="compile-core-test" depends="resolve-test, compile-core" description="--> compile test code"> + <target name="compile-core-test" depends="init, resolve-test, compile-core" description="--> compile test code"> <javac encoding="${build.encoding}" srcdir="${test.src.dir}"
svn commit: r1608130 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/test/org/apache/nutch/util/ branches/2.x/src/testresources/test-mime-util/ trunk/ trunk/sr
Author: snagel Date: Sat Jul 5 20:36:33 2014 New Revision: 1608130 URL: http://svn.apache.org/r1608130 Log: NUTCH-1605 MIME type detector recognizes xlsx as zip file Added: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java (with props) nutch/branches/2.x/src/testresources/test-mime-util/ nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx (with props) nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (with props) nutch/trunk/src/testresources/test-mime-util/ nutch/trunk/src/testresources/test-mime-util/test.xlsx (with props) Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608130r1=1608129r2=1608130view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Jul 5 20:36:33 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) + * NUTCH-385 Improve description of thread related configuration for Fetcher (jnioche,lufeng) * NUTCH-1798 Crawl script not calling index command correctly (Aaron Bedward via jnioche) Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130r1=1608129r2=1608130view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul 5 20:36:33 2014 @@ -19,13 +19,16 @@ package org.apache.nutch.util; // JDK imports import java.io.File; +import java.io.IOException; +import java.io.InputStream; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Tika imports import org.apache.tika.Tika; -import 
org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; @@ -128,10 +131,10 @@ public final class MimeUtil { * strategies available within Tika. First, the mime type provided in * codetypeName/code is cleaned, with {@link #cleanMimeType(String)}. * Then the cleaned mime type is looked up in the underlying Tika - * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is - * found, then that mime type is used, otherwise URL resolution is - * used to try and determine the mime type. If that means is unsuccessful, and - * if codemime.type.magic/code is enabled in {@link NutchConfiguration}, + * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} + * is found, then that mime type is used, otherwise URL resolution is + * used to try and determine the mime type. However, if + * codemime.type.magic/code is enabled in {@link NutchConfiguration}, * then mime type magic resolution is used to try and obtain a * better-than-the-default approximation of the {@link MimeType}. * @@ -145,24 +148,19 @@ public final class MimeUtil { */ public String autoResolveContentType(String typeName, String url, byte[] data) { String retType = null; -String magicType = null; MimeType type = null; String cleanedMimeType = null; -try { - cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes - .forName(MimeUtil.cleanMimeType(typeName)).getName() - : null; -} catch (MimeTypeException mte) { - // Seems to be a malformed mime type name... -} - +cleanedMimeType = MimeUtil.cleanMimeType(typeName); // first try to get the type from the cleaned type name -try { - type = cleanedMimeType != null ? 
this.mimeTypes.forName(cleanedMimeType) - : null; -} catch (MimeTypeException e) { - type = null; +if (cleanedMimeType != null) { + try { +type = mimeTypes.forName(cleanedMimeType); +cleanedMimeType = type.getName(); + } catch (MimeTypeException mte) { +// Seems to be a malformed mime type name... +cleanedMimeType = null; + } } // if returned null, or if it's the default type then try url resolution @@ -172,8 +170,6 @@ public final class MimeUtil { // mime-type, then guess a mime-type from the url pattern try { -TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); -Tika tika = new Tika(tikaConfig); retType = tika.detect(url) != null ? tika.detect(url) : null; } catch (Exception e
svn commit: r1608135 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/crawl branches/2.x/src/bin/nutch trunk/CHANGES.txt trunk/src/bin/crawl trunk/src/bin/nutch
Author: snagel Date: Sat Jul 5 21:13:19 2014 New Revision: 1608135 URL: http://svn.apache.org/r1608135 Log: NUTCH-1566 bin/nutch to allow whitespace in paths Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/bin/crawl nutch/branches/2.x/src/bin/nutch nutch/trunk/CHANGES.txt nutch/trunk/src/bin/crawl nutch/trunk/src/bin/nutch Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608135r1=1608134r2=1608135view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Jul 5 21:13:19 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel) + * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) * NUTCH-385 Improve description of thread related configuration for Fetcher (jnioche,lufeng) Modified: nutch/branches/2.x/src/bin/crawl URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1608135r1=1608134r2=1608135view=diff == --- nutch/branches/2.x/src/bin/crawl (original) +++ nutch/branches/2.x/src/bin/crawl Sat Jul 5 21:13:19 2014 @@ -70,12 +70,12 @@ timeLimitFetch=180 addDays=0 # -bin=`dirname $0` -bin=`cd $bin; pwd` +bin=`dirname $0` +bin=`cd $bin; pwd` # determines whether mode based on presence of job file mode=local -if [ -f ${bin}/../*nutch*.job ]; then +if [ -f ${bin}/../*nutch*.job ]; then mode=distributed fi @@ -92,8 +92,7 @@ if [ $mode = distributed ]; then fi # initial injection -$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID - +$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID if [ $? -ne 0 ] then exit $? fi @@ -114,14 +113,14 @@ do batchId=`date +%s`-$RANDOM echo Generating a new fetchlist - $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId + $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId if [ $? 
-ne 0 ] then exit $? fi echo Fetching : - $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50 + $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50 if [ $? -ne 0 ] then exit $? @@ -132,7 +131,7 @@ do # enable the skipping of records for the parsing so that a dodgy document # so that it does not fail the full task skipRecordsOptions=-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1 - $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID + $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID if [ $? -ne 0 ] then exit $? @@ -140,21 +139,21 @@ do # updatedb with this batch echo CrawlDB update for $CRAWL_ID - $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID + $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID if [ $? -ne 0 ] then exit $? fi echo Indexing $CRAWL_ID on SOLR index - $SOLRURL - $bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId $CRAWL_ID + $bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId $CRAWL_ID if [ $? -ne 0 ] then exit $? fi echo SOLR dedup - $SOLRURL - $bin/nutch solrdedup $commonOptions $SOLRURL + $bin/nutch solrdedup $commonOptions $SOLRURL if [ $? -ne 0 ] then exit $? Modified: nutch/branches/2.x/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1608135r1=1608134r2=1608135view=diff == --- nutch/branches/2.x/src/bin/nutch (original) +++ nutch/branches/2.x/src/bin/nutch Sat Jul 5 21:13:19 2014 @@ -25,6 +25,14 @@ # Default is 1000. # # NUTCH_OPTS Extra Java runtime options. +# Multiple options must be separated by white space. +# +# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs) +# +# NUTCH_LOGFILE Log file (default: hadoop.log) +# +# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf). 
+# Multiple paths must be separated by a colon ':'. # cygwin=false case `uname` in @@ -78,13 +86,13 @@ COMMAND=$1 shift # some directories -THIS_DIR=`dirname $THIS` -NUTCH_HOME=`cd $THIS_DIR/.. ; pwd` +THIS_DIR=`dirname $THIS` +NUTCH_HOME=`cd $THIS_DIR/.. ; pwd` # some Java parameters if [ $NUTCH_JAVA_HOME != ]; then #echo run java in $NUTCH_JAVA_HOME - JAVA_HOME=$NUTCH_JAVA_HOME + JAVA_HOME=$NUTCH_JAVA_HOME fi if [ $JAVA_HOME
svn commit: r1608136 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/plugin/ trunk/ trunk/src/java/org/apache/nutch/plugin/
Author: snagel Date: Sat Jul 5 21:42:20 2014 New Revision: 1608136 URL: http://svn.apache.org/r1608136 Log: NUTCH-1776 Log incorrect plugin.folder file path Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608136r1=1608135r2=1608136view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Jul 5 21:42:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel) + * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel) * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1608136r1=1608135r2=1608136view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java Sat Jul 5 21:42:20 2014 @@ -133,6 +133,9 @@ public class PluginManifestParser { } catch (UnsupportedEncodingException e) { } directory = new File(path); +} else if (!directory.exists()) { + LOG.warn(Plugins: directory not found: + name); + return null; } return directory; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1608136r1=1608135r2=1608136view=diff == --- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java Sat Jul 5 21:42:20 2014 @@ -71,6 +71,10 @@ public class PluginRepository { PluginManifestParser manifestParser = new PluginManifestParser(this.conf, this); MapString, PluginDescriptor allPlugins = manifestParser .parsePluginFolder(pluginFolders); +if (allPlugins.isEmpty()) { + LOG.warn(No plugins found on paths of property plugin.folders=\{}\, + conf.get(plugin.folders)); +} Pattern excludes = Pattern.compile(conf.get(plugin.excludes, )); Pattern includes = Pattern.compile(conf.get(plugin.includes, )); MapString, PluginDescriptor filteredPlugins = filter(excludes, includes, Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1608136r1=1608135r2=1608136view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Jul 5 21:42:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel) + * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel) * NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1608136r1=1608135r2=1608136view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Sat Jul 5 21:42:20 2014 @@ -134,6 +134,9 @@ public class PluginManifestParser { } catch (UnsupportedEncodingException e) { } directory = new File(path); +} else if (!directory.exists()) { + LOG.warn(Plugins: directory not found: + name); + return null; } return directory; } Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1608136r1=1608135r2=1608136view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Sat Jul 5 21:42:20 2014 @@ -74,6 +74,10 @@ public class
svn commit: r1609568 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/nutch trunk/CHANGES.txt trunk/src/bin/nutch
Author: snagel Date: Thu Jul 10 20:50:27 2014 New Revision: 1609568 URL: http://svn.apache.org/r1609568 Log: NUTCH-1811 bin/nutch junit to use junit 4 test runner Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/bin/nutch nutch/trunk/CHANGES.txt nutch/trunk/src/bin/nutch Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1609568r1=1609567r2=1609568view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Thu Jul 10 20:50:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel) + * NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel) * NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel) Modified: nutch/branches/2.x/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1609568r1=1609567r2=1609568view=diff == --- nutch/branches/2.x/src/bin/nutch (original) +++ nutch/branches/2.x/src/bin/nutch Thu Jul 10 20:50:27 2014 @@ -238,7 +238,7 @@ elif [ $COMMAND = nutchserver ] ; th CLASS=org.apache.nutch.api.NutchServer elif [ $COMMAND = junit ] ; then CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/ - CLASS=junit.textui.TestRunner + CLASS=org.junit.runner.JUnitCore else CLASS=$COMMAND fi Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1609568r1=1609567r2=1609568view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 10 20:50:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel) + * NUTCH-1799 ANT Eclipse task discovers all plugin jars automatically (jnioche) * NUTCH-578 URL fetched with 403 is generated over and over again (snagel) Modified: nutch/trunk/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1609568r1=1609567r2=1609568view=diff == --- 
nutch/trunk/src/bin/nutch (original) +++ nutch/trunk/src/bin/nutch Thu Jul 10 20:50:27 2014 @@ -262,7 +262,7 @@ elif [ $COMMAND = plugin ] ; then CLASS=org.apache.nutch.plugin.PluginRepository elif [ $COMMAND = junit ] ; then CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/ - CLASS=junit.textui.TestRunner + CLASS=org.junit.runner.JUnitCore else CLASS=$COMMAND fi
svn commit: r1614375 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/indexer/ branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic
Author: snagel Date: Tue Jul 29 15:13:20 2014 New Revision: 1614375 URL: http://svn.apache.org/r1614375 Log: NUTCH-1708 use same id when indexing and deleting redirects Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/schema.xml nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java nutch/trunk/CHANGES.txt nutch/trunk/conf/schema-solr4.xml nutch/trunk/conf/schema.xml nutch/trunk/conf/solrindex-mapping.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1614375r1=1614374r2=1614375view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Jul 29 15:13:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1708 use same id when indexing and deleting redirects (snagel) + * NUTCH-1817 Remove pom.xml from source (jnioche) * NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel) Modified: nutch/branches/2.x/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1614375r1=1614374r2=1614375view=diff == --- nutch/branches/2.x/conf/schema.xml (original) +++ nutch/branches/2.x/conf/schema.xml Tue Jul 29 15:13:20 2014 @@ -307,7 +307,7 @@ to include it as performance improvements are minimal. 
-- field name=_version_ type=long indexed=true stored=true/ -field name=id type=string stored=true indexed=true/ +field name=id type=string stored=true indexed=true required=true/ !-- core fields -- field name=batchId type=string stored=true indexed=false/ @@ -316,7 +316,7 @@ !-- fields for index-basic plugin -- field name=host type=url stored=false indexed=true/ -field name=url type=url stored=true indexed=true required=true/ +field name=url type=url stored=true indexed=true/ field name=orig type=url stored=true indexed=true / !-- stored=true for highlighting, use term vectors and positions for fast highlighting -- field name=content type=text_general stored=true indexed=true/ Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375r1=1614374r2=1614375view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014 @@ -123,6 +123,7 @@ public class IndexingFiltersChecker exte } NutchDocument doc = new NutchDocument(); +doc.add(id, url); doc.add(digest, StringUtil.toHexString(page.getSignature())); try { Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375r1=1614374r2=1614375view=diff == --- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original) +++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014 @@ -121,7 +121,7 @@ public class ElasticIndexWriter implemen 
@Override public void write(NutchDocument doc) throws IOException { -String id = (String) doc.getFieldValue(url); +String id = (String) doc.getFieldValue(id); String type = doc.getDocumentMeta().get(type); if (type == null) type = doc; Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614375r1=1614374r2=1614375view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 29 15:13:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1708 use same id when indexing and deleting redirects (snagel) + * NUTCH-1818 Add deps-test-compile task
svn commit: r1618521 - /nutch/cms_site/trunk/content/index.md
Author: snagel Date: Sun Aug 17 20:24:29 2014 New Revision: 1618521 URL: http://svn.apache.org/r1618521 Log: CMS commit to nutch by snagel Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1618521&r1=1618520&r2=1618521&view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Sun Aug 17 20:24:29 2014 @@ -120,6 +120,13 @@ under the License. </div> <div class="jumbotron"> + <h2>31 July 2014 - Nutch tutorial at upcoming <a href="http://events.linuxfoundation.org/events/apachecon-europe">ApacheCon Europe</a> in Budapest</h2> + <p>The upcoming <a href="http://events.linuxfoundation.org/events/apachecon-europe">ApacheCon Europe</a> in Budapest, November 17 - 21, 2014, + will offer a one-day <a href="http://sched.co/1pbE15n">Nutch tutorial</a>. Topics will span from Nutch installation and configuration up to plugin + development. Both Nutch 1.x and 2.x are covered. The conference is a good opportunity to bring together both users and committers of Nutch and related projects.</p> + </div> + + <div class="jumbotron"> <h2>01 May 2014 - Apache Nutch Participates in <a href="https://www.google-melange.com/gsoc/homepage/google/gsoc2014">Google Summer of Code</a></h2> <a title="ApacheCon US 2009" href="http://www.us.apachecon.com/c/acus2009/"> <img src="http://typo3.org/fileadmin/t3org/images/FM-news/2014/thisweek/920x156xbanner-gsoc2014.jpg" class="float-right" alt="GSoC Logo"/>
svn commit: r919651 - /websites/production/nutch/content/
Author: snagel Date: Sun Aug 17 20:26:24 2014 New Revision: 919651 Log: announce tutorial at ApacheCon Europe in Budapest Added: websites/production/nutch/content/ - copied from r919650, websites/staging/nutch/trunk/content/
svn commit: r1619934 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/crawl/ trunk/ trunk/src/java/org/apache/nutch/crawl/
Author: snagel Date: Fri Aug 22 21:23:32 2014 New Revision: 1619934 URL: http://svn.apache.org/r1619934 Log: NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619934r1=1619933r2=1619934view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 21:23:32 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) + * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc) * NUTCH-1708 use same id when indexing and deleting redirects (snagel) Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1619934r1=1619933r2=1619934view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Fri Aug 22 21:23:32 2014 @@ -60,12 +60,8 @@ implements FetchSchedule { public void setConf(Configuration conf) { super.setConf(conf); if (conf == null) return; -int oldDefaultInterval = conf.getInt(db.default.fetch.interval, 0); defaultInterval = conf.getInt(db.fetch.interval.default, 0); -if (oldDefaultInterval 0 defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY; -int oldMaxInterval = conf.getInt(db.max.fetch.interval, 0); maxInterval = conf.getInt(db.fetch.interval.max, 0 ); -if 
(oldMaxInterval 0 maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; LOG.info(defaultInterval= + defaultInterval); LOG.info(maxInterval= + maxInterval); } Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619934r1=1619933r2=1619934view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Aug 22 21:23:32 2014 @@ -2,7 +2,7 @@ Nutch Change Log Nutch Current Development -* NUTCH-XX +* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) Nutch 1.9 Release Change Log - 12/08/2014 (dd/mm/) Release Report - http://s.apache.org/1.9-release Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1619934r1=1619933r2=1619934view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Aug 22 21:23:32 2014 @@ -50,9 +50,7 @@ public class CrawlDbReducer implements R retryMax = job.getInt(db.fetch.retry.max, 3); scfilters = new ScoringFilters(job); additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true); -int oldMaxInterval = job.getInt(db.max.fetch.interval, 0); maxInterval = job.getInt(db.fetch.interval.max, 0 ); -if (oldMaxInterval 0 maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; schedule = FetchScheduleFactory.getFetchSchedule(job); int maxLinks = job.getInt(db.update.max.inlinks, 1); linked = new InlinkPriorityQueue(maxLinks); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1619934r1=1619933r2=1619934view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ 
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Aug 22 21:23:32 2014 @@ -74,9 +74,6 @@ public class Generator extends Configure public static final String GENERATOR_DELAY = crawl.gen.delay; public static final String GENERATOR_MAX_NUM_SEGMENTS = generate.max.num.segments; - // deprecated parameters - public static final String GENERATE_MAX_PER_HOST_BY_IP = generate.max.per.host.by.ip; - public static class SelectorEntry implements Writable
svn commit: r1619942 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/crawl/ branches/2.x/src/java/org/apache/nutch/parse/ trunk/ trunk/src/java/org/apache/nutch/crawl/
Author: snagel Date: Fri Aug 22 22:23:27 2014 New Revision: 1619942 URL: http://svn.apache.org/r1619942 Log: NUTCH-1693 TextMD5Signature computed on textual content Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props) nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props) Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java nutch/trunk/CHANGES.txt Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619942r1=1619941r2=1619942view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 22:23:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel) + * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc) Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942view=auto == --- nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added) +++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 22 22:23:27 2014 @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.util.Collection; +import java.util.HashSet; + +import org.apache.hadoop.io.MD5Hash; +import org.apache.nutch.storage.WebPage; + +/** + * Default implementation of a page signature. It calculates an MD5 hash of the + * textual content of a page. In case there is no text, it calculates a hash + * from the page's fetched content. + */ +public class TextMD5Signature extends Signature { + + private final static CollectionWebPage.Field FIELDS = new HashSetWebPage.Field(); + + static { +FIELDS.add(WebPage.Field.TEXT); + } + + Signature fallback = new MD5Signature(); + + @Override + public byte[] calculate(WebPage page) { +CharSequence text = page.getText(); + +if (text == null || text.length() == 0) { + return fallback.calculate(page); +} + +return MD5Hash.digest(text.toString()).getDigest(); + } + + @Override + public CollectionWebPage.Field getFields() { +CollectionWebPage.Field fields = new HashSetWebPage.Field(FIELDS); +fields.addAll(fallback.getFields()); +return fields; + } +} Propchange: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java -- svn:eol-style = native Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1619942r1=1619941r2=1619942view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 22 22:23:27 2014 @@ -187,8 +187,6 @@ public 
class ParseUtil extends Configure return; } -final byte[] signature = sig.calculate(page); - org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus(); page.setParseStatus(pstatus); if (ParseStatusUtils.isSuccess(pstatus)) { @@ -233,6 +231,7 @@ public class ParseUtil extends Configure if (prevSig != null) { page.setPrevSignature(prevSig); } +final byte[] signature = sig.calculate(page); page.setSignature(ByteBuffer.wrap(signature)); if (page.getOutlinks() != null) { page.getOutlinks().clear(); Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev
svn commit: r1619944 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFilter.java
Author: snagel Date: Fri Aug 22 22:28:12 2014 New Revision: 1619944 URL: http://svn.apache.org/r1619944 Log: NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619944&r1=1619943&r2=1619944&view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Aug 22 22:28:12 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel) + * NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel) * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=1619944&r1=1619943&r2=1619944&view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Fri Aug 22 22:28:12 2014 @@ -39,15 +39,22 @@ public interface IndexingFilter extends /** * Adds fields or otherwise modifies the document that will be indexed for a - * parse. Unwanted documents can be removed from indexing by returning a null value. + * parse. Unwanted documents can be removed from indexing by returning a null + * value. 
* - * @param doc document instance for collecting fields - * @param parse parse data instance - * @param url page url - * @param datum crawl datum for the page - * @param inlinks page inlinks - * @return modified (or a new) document instance, or null (meaning the document - * should be discarded) + * @param doc + * document instance for collecting fields + * @param parse + * parse data instance + * @param url + * page url + * @param datum + * crawl datum for the page (fetch datum from segment containing + * fetch status and fetch time) + * @param inlinks + * page inlinks + * @return modified (or a new) document instance, or null (meaning the + * document should be discarded) * @throws IndexingException */ NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
svn commit: r1625821 - in /nutch/cms_site/trunk/content/apidocs/apidocs-1.9: ./ org/ org/apache/ org/apache/nutch/ org/apache/nutch/analysis/ org/apache/nutch/analysis/lang/ org/apache/nutch/analysis/
Author: snagel Date: Wed Sep 17 20:52:17 2014 New Revision: 1625821 URL: http://svn.apache.org/r1625821 Log: add 1.9 Java apidocs [This commit notification would consist of 137 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
svn commit: r1625826 - /nutch/cms_site/trunk/content/javadoc.md
Author: snagel Date: Wed Sep 17 21:07:29 2014 New Revision: 1625826 URL: http://svn.apache.org/r1625826 Log: add apidoc 1.9 Modified: nutch/cms_site/trunk/content/javadoc.md Modified: nutch/cms_site/trunk/content/javadoc.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/javadoc.md?rev=1625826&r1=1625825&r2=1625826&view=diff == --- nutch/cms_site/trunk/content/javadoc.md (original) +++ nutch/cms_site/trunk/content/javadoc.md Wed Sep 17 21:07:29 2014 @@ -41,7 +41,7 @@ under the License. <h2>Current Releases Javadoc</h2> <ul> <li><a href="./apidocs/apidocs-2.2.1/index.html">2.2.1 (2.X Branch)</a></li> -<li><a href="./apidocs/apidocs-1.8/index.html">1.8 (1.X branch)</a></li> +<li><a href="./apidocs/apidocs-1.9/index.html">1.9 (1.X branch)</a></li> </ul> </div> </section> @@ -55,6 +55,7 @@ under the License. <li><a href="./apidocs/apidocs-2.2/index.html">2.2</a></li> <li><a href="./apidocs/apidocs-2.1/index.html">2.1</a></li> <li><a href="./apidocs/apidocs-2.0/index.html">2.0</a></li> + <li><a href="./apidocs/apidocs-1.8/index.html">1.8</a></li> <li><a href="./apidocs/apidocs-1.7/index.html">1.7</a></li> <li><a href="./apidocs/apidocs-1.6/index.html">1.6</a></li> <li><a href="./apidocs/apidocs-1.5/index.html">1.5</a></li>
svn commit: r922601 - /websites/production/nutch/content/
Author: snagel Date: Wed Sep 17 21:08:05 2014 New Revision: 922601 Log: add Java apidoc 1.9 Added: websites/production/nutch/content/ - copied from r922599, websites/staging/nutch/trunk/content/
svn commit: r922608 - /websites/production/nutch/content/
Author: snagel Date: Wed Sep 17 21:32:43 2014 New Revision: 922608 Log: update Java apidoc 1.9 Added: websites/production/nutch/content/ - copied from r922607, websites/staging/nutch/trunk/content/
svn commit: r1626581 - in /nutch: branches/2.x/KEYS branches/2.x/ivy/mvn.template trunk/KEYS trunk/ivy/mvn.template
Author: snagel Date: Sun Sep 21 14:18:26 2014 New Revision: 1626581 URL: http://svn.apache.org/r1626581 Log: add committer snagel Modified: nutch/branches/2.x/KEYS nutch/branches/2.x/ivy/mvn.template nutch/trunk/KEYS nutch/trunk/ivy/mvn.template Modified: nutch/branches/2.x/KEYS URL: http://svn.apache.org/viewvc/nutch/branches/2.x/KEYS?rev=1626581r1=1626580r2=1626581view=diff == --- nutch/branches/2.x/KEYS (original) +++ nutch/branches/2.x/KEYS Sun Sep 21 14:18:26 2014 @@ -391,3 +391,39 @@ mig+aEkS1Y2uDm5fXNPICB/eKaZ02rVt1hzA5acj lTN6fp142rTJsWIbmePnY+jJhrGpAtdYx+2hFH0c5CPQ =cK1G -END PGP PUBLIC KEY BLOCK- +pub 2048R/DB0A9C6D 2012-11-06 +uid Sebastian Nagel sna...@apache.org +sig 3DB0A9C6D 2012-11-06 Sebastian Nagel sna...@apache.org +sub 2048R/9E631E2F 2012-11-06 +sig DB0A9C6D 2012-11-06 Sebastian Nagel sna...@apache.org + +-BEGIN PGP PUBLIC KEY BLOCK- +Version: GnuPG v1 + +mQENBFCY+9kBCADcgOPlGGBOPuldasyoEXoJwt8ACbRqZbeIQhS3YIIPFFRTJvns +GlMAGyiSbMRkfi2uFz1g3u0uiMatD3CEHgV4wqu1d/ALmUKhGJ4VluEzjyRtRgUk +OD0Xw52Rcp27GNdAwpEojWSy764PbGotNsRvqehYnu+iVBVpxRc//vfPjq/nt5xk +BwTsR/o/EKulvFB6B1x/bySNZuJZksdpEOxA6s30Ig25nXA//9DrU3Vse40cz1LI +wx2rCsqU15SxAabqXMdNeQD7gmlE+toPp+ziu3drX8U6iEYwC71RCnc8LtxXt/Aq +cSnzlAmUmKNZpHM8AqKbW+IVH8iXN/LtKQLxABEBAAG0I1NlYmFzdGlhbiBOYWdl +bCA8c25hZ2VsQGFwYWNoZS5vcmc+iQE4BBMBAgAiBQJQmPvZAhsDBgsJCAcDAgYV +CAIJCgsEFgIDAQIeAQIXgAAKCRDGbqe32wqcbWpDCAC5r0Wy1vZ5luLR2IhqDH9a +q7FLqpLL66LUBPX3mof9hTV3456uraozdmVyO39X3IvXlrQavt2ubreWxMBT+g2z +hBiQHy5RkjCqYXXz5Gkkxv4rggjcHgwKZDN2gK0VP+7rUqg0JPW9hQPcEdlAqBi0 +i9knjoAwKEpypvo202KWF6JAeUiEOAXAdhcm98uzJN17GRY/N9+3ELbBy6zgbQpl +6GYNNyS+vW+40aKThrOe7lvjBERN6v01yZ3QjQBfFkaefNjnXOaiR+JkQ5m99DBB +vbGcQPGx5xpGGGjMbbfB4LT57/F0FxEzO+PgNph2vzEwM8+4BWMaEhxBGdvwuNW/ +uQENBFCY+9kBCAC+LI5lWBrOdCCbd62q4sIjqyJEMzFjozd73aDc3xA1dCrsIz02 ++eg0LSmvt3DdPoDTMVLXs4GYM48U139Qy4o4T9gRNhFS+Im+OnI9CLKX29qy7hEu +xxa1ByA5pZgJ+21wmCBjTVK9Jcf1JeDcAr2L3qVFY2+Hhvh3eOlNov9NzQJpmOv3 
+9JRiia4Xm0h9AffL9P+AKlGRuCPfnsPu5JzEuo/wqQ+oIunYJFiCNbU/CaP1PK0x +S0taB8fsYu/UBh1+bzc0xQvHWYmupqc62qGk8N+useZiKn+4BBvhTU6fykCrZTVH +jGNi9qpPRy6bPpy9yknVRJDkrUGHiq8VqGA1ABEBAAGJAR8EGAECAAkFAlCY+9kC +GwwACgkQxm6nt9sKnG1r2ggAxCjb8IoAjVddbEduTkWCFkqtpCjGFj/J1TetooqI +SvKBeRDZyJ+kA10BIGg6DudYCApo7ObZN6EhxwhVDuPa7nxacrKVgJyhztmFDT8X +zfhxQ8ytVWayHnvesmwolquIQtqRPfIvB/AwGZ9PjfJFMC6A229tTBhAgva4h8GE +EEE6JEV4AIQRAcoisr4chzq/9xm19TEjYMvtE92QBiTYhu7uTfUQbnyP3uN4bLEm +xY2l8d0700NQh27drc5An1wWeYZj+4HrFhnOXktODwi+8W3WNOGr71L1XPttW+G5 +ZnAbVvXpIOwmMCrU7YaCahFry/H+I7G+gWZ4mvujVMgoqw== +=RlMy +-END PGP PUBLIC KEY BLOCK- Modified: nutch/branches/2.x/ivy/mvn.template URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/mvn.template?rev=1626581r1=1626580r2=1626581view=diff == --- nutch/branches/2.x/ivy/mvn.template (original) +++ nutch/branches/2.x/ivy/mvn.template Sun Sep 21 14:18:26 2014 @@ -78,6 +78,11 @@ nameTalat Uyarer/name emailta...@apache.org/email /developer + developer + idsnagel/id + nameSebastian Nagel/name + emailsna...@apache.org/email + /developer /developers build sourceDirectorysrc/java/sourceDirectory Modified: nutch/trunk/KEYS URL: http://svn.apache.org/viewvc/nutch/trunk/KEYS?rev=1626581r1=1626580r2=1626581view=diff == --- nutch/trunk/KEYS (original) +++ nutch/trunk/KEYS Sun Sep 21 14:18:26 2014 @@ -299,3 +299,39 @@ NPgXHx4ASqesjF/9GUrAQfOmXqHdOF6xOb7YYGss U3Wt+q9F6O+RmemV6a6mrpog+Aq+BkIMWCJ8 =xHbT -END PGP PUBLIC KEY BLOCK- +pub 2048R/DB0A9C6D 2012-11-06 +uid Sebastian Nagel sna...@apache.org +sig 3DB0A9C6D 2012-11-06 Sebastian Nagel sna...@apache.org +sub 2048R/9E631E2F 2012-11-06 +sig DB0A9C6D 2012-11-06 Sebastian Nagel sna...@apache.org + +-BEGIN PGP PUBLIC KEY BLOCK- +Version: GnuPG v1 + +mQENBFCY+9kBCADcgOPlGGBOPuldasyoEXoJwt8ACbRqZbeIQhS3YIIPFFRTJvns +GlMAGyiSbMRkfi2uFz1g3u0uiMatD3CEHgV4wqu1d/ALmUKhGJ4VluEzjyRtRgUk +OD0Xw52Rcp27GNdAwpEojWSy764PbGotNsRvqehYnu+iVBVpxRc//vfPjq/nt5xk +BwTsR/o/EKulvFB6B1x/bySNZuJZksdpEOxA6s30Ig25nXA//9DrU3Vse40cz1LI 
+wx2rCsqU15SxAabqXMdNeQD7gmlE+toPp+ziu3drX8U6iEYwC71RCnc8LtxXt/Aq +cSnzlAmUmKNZpHM8AqKbW+IVH8iXN/LtKQLxABEBAAG0I1NlYmFzdGlhbiBOYWdl +bCA8c25hZ2VsQGFwYWNoZS5vcmc+iQE4BBMBAgAiBQJQmPvZAhsDBgsJCAcDAgYV +CAIJCgsEFgIDAQIeAQIXgAAKCRDGbqe32wqcbWpDCAC5r0Wy1vZ5luLR2IhqDH9a +q7FLqpLL66LUBPX3mof9hTV3456uraozdmVyO39X3IvXlrQavt2ubreWxMBT+g2z +hBiQHy5RkjCqYXXz5Gkkxv4rggjcHgwKZDN2gK0VP+7rUqg0JPW9hQPcEdlAqBi0 +i9knjoAwKEpypvo202KWF6JAeUiEOAXAdhcm98uzJN17GRY/N9+3ELbBy6zgbQpl +6GYNNyS+vW
svn commit: r1629076 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: snagel Date: Thu Oct 2 21:37:04 2014 New Revision: 1629076 URL: http://svn.apache.org/r1629076 Log: NUTCH-1826 indexchecker fails if solr.server.url not configured Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1629076&r1=1629075&r2=1629076&view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Oct 2 21:37:04 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured (lewismc, snagel) + * NUTCH-1866 ant eclipse target should not delete runtime (nimafl vai lewismc) * NUTCH-1857 readb -dump -format csv should use comma (lewismc) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1629076&r1=1629075&r2=1629076&view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Oct 2 21:37:04 2014 @@ -95,8 +95,6 @@ public class IndexingFiltersChecker exte ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum); -IndexWriters writers = new IndexWriters(getConf()); - if (!output.getStatus().isSuccess()) { System.out.println("Fetch failed with protocol status: " + output.getStatus()); return 0; @@ -166,9 +164,10 @@ public class IndexingFiltersChecker exte } if (conf.getBoolean("doIndex", false) && doc!=null){ - writers.open(new JobConf(getConf()), "IndexingFilterChecker"); - writers.write(doc); - writers.close(); + IndexWriters writers = new IndexWriters(getConf()); + writers.open(new JobConf(getConf()), "IndexingFilterChecker"); + writers.write(doc); + writers.close(); } return 0;
svn commit: r1630565 - in /nutch/trunk: ./ src/plugin/ src/plugin/protocol-http/ src/plugin/protocol-http/jsp/ src/plugin/protocol-http/src/test/conf/ src/plugin/protocol-http/src/test/org/apache/nutc
Author: snagel Date: Thu Oct 9 19:20:51 2014 New Revision: 1630565 URL: http://svn.apache.org/r1630565 Log: NUTCH-1164 JUnit tests for protocol-http Added: nutch/trunk/src/plugin/protocol-http/jsp/ nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp (with props) nutch/trunk/src/plugin/protocol-http/jsp/brokenpage.jsp (with props) nutch/trunk/src/plugin/protocol-http/jsp/redirect301.jsp (with props) nutch/trunk/src/plugin/protocol-http/jsp/redirect302.jsp (with props) nutch/trunk/src/plugin/protocol-http/src/test/conf/ nutch/trunk/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml (with props) nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java (with props) Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/src/plugin/build.xml nutch/trunk/src/plugin/protocol-http/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1630565r1=1630564r2=1630565view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Oct 9 19:20:51 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1164 Write JUnit tests for protocol-http (nimafl via snagel) + * NUTCH-1868 Document and improve CLI for FileDumper tool (lewismc) * NUTCH-1869 Add a flag to -mimeType fiag to FileDumper (lewismc) @@ -10,7 +12,7 @@ Nutch Current Development 1.10-SNAPSHOT * NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured (lewismc, snagel) -* NUTCH-1866 ant eclipse target should not delete runtime (nimafl vai lewismc) +* NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc) * NUTCH-1857 readb -dump -format csv should use comma (lewismc) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1630565r1=1630564r2=1630565view=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Thu Oct 9 19:20:51 2014 @@ -992,7 +992,7 @@ source 
path=${plugins.dir}/protocol-httpclient/src/java/ / source path=${plugins.dir}/protocol-httpclient/src/test/ / source path=${plugins.dir}/protocol-http/src/java/ / -!-- source path=${plugins.dir}/protocol-http/src/test/ / -- +source path=${plugins.dir}/protocol-http/src/test/ / source path=${plugins.dir}/scoring-depth/src/java/ / source path=${plugins.dir}/scoring-link/src/java/ / source path=${plugins.dir}/scoring-opic/src/java/ / Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1630565r1=1630564r2=1630565view=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Thu Oct 9 19:20:51 2014 @@ -88,6 +88,7 @@ ant dir=language-identifier target=test/ ant dir=lib-http target=test/ ant dir=protocol-file target=test/ + ant dir=protocol-http target=test/ ant dir=protocol-httpclient target=test/ !--ant dir=parse-ext target=test/-- ant dir=feed target=test/ Modified: nutch/trunk/src/plugin/protocol-http/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/build.xml?rev=1630565r1=1630564r2=1630565view=diff == --- nutch/trunk/src/plugin/protocol-http/build.xml (original) +++ nutch/trunk/src/plugin/protocol-http/build.xml Thu Oct 9 19:20:51 2014 @@ -29,12 +29,22 @@ fileset dir=${nutch.root}/build include name=**/lib-http/*.jar / /fileset +pathelement location=${build.dir}/test/conf/ /path !-- Deploy Unit test dependencies -- target name=deps-test ant target=deploy inheritall=false dir=../lib-http/ ant target=deploy inheritall=false dir=../nutch-extensionpoints/ +copy toDir=${build.test} + fileset dir=${src.test} excludes=**/*.java/ +/copy /target + !-- for junit test -- + mkdir dir=${build.test}/data / + copy todir=${build.test}/data + fileset dir=jsp/ + /copy + /project Added: nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp?rev=1630565view=auto == --- 
nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp (added) +++ nutch/trunk/src/plugin/protocol-http/jsp/basic-http.jsp Thu Oct 9 19:20:51 2014 @@ -0,0 +1,44 @@ +%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional
svn commit: r1633222 - in /nutch/branches/2.x: ./ conf/ src/java/org/apache/nutch/parse/ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/ src/plugin/parse-html/src/java/org/apache
Author: snagel Date: Mon Oct 20 20:44:00 2014 New Revision: 1633222 URL: http://svn.apache.org/r1633222 Log: NUTCH-1827 Port issues 1467 and 1561 to 2.x Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/nutch-default.xml nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1633222r1=1633221r2=1633222view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Oct 20 20:44:00 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.3-SNAPSHOT +* NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel) + * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche) * NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc) Modified: nutch/branches/2.x/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1633222r1=1633221r2=1633222view=diff == --- nutch/branches/2.x/conf/nutch-default.xml (original) +++ nutch/branches/2.x/conf/nutch-default.xml Mon Oct 20 20:44:00 2014 @@ -1125,7 +1125,7 @@ description Comma-separated list of keys to be taken from the metadata to generate fields. Can be used e.g. for 'description' or 'keywords' provided that these values are generated - by a parser (see parse-metatags plugin) + by a parser (see parse-metatags plugin), and property 'metatags.names'. 
/description /property @@ -1133,11 +1133,12 @@ property namemetatags.names/name value*/value - description Names of the metatags to extract, separated by ';'. - Use '*' to extract all metatags. Prefixes the names with 'meta_' - in the parse-metadata. For instance to index description and keywords, - you need to activate the plugin index-metadata and set the value of the - parameter 'index.metadata' to 'meta_description;meta_keywords'. + descriptionNames of the metatags to extract, separated by ','. + Use '*' to extract all metatags. Prefixes the names with 'meta_' in + the parse-metadata. For instance, to index description and keywords, + you need to activate the plugins parse-metadata and index-metadata + and set the value of the properties 'metatags.names' and + 'index.metadata' to 'description,keywords'. /description /property Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1633222r1=1633221r2=1633222view=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java Mon Oct 20 20:44:00 2014 @@ -21,6 +21,8 @@ import java.net.URL; import java.util.Iterator; import java.util.Properties; +import org.apache.nutch.metadata.Metadata; + /** * This class holds the information about HTML meta tags extracted from * a page. Some special tags have convenience methods for easy checking. @@ -40,7 +42,7 @@ public class HTMLMetaTags { private URL refreshHref = null; - private Properties generalTags = new Properties(); + private Metadata generalTags = new Metadata(); private Properties httpEquivTags = new Properties(); @@ -166,7 +168,7 @@ public class HTMLMetaTags { * Returns all collected values of the general meta tags. Property names are * tag names, property values are content values. 
*/ - public Properties getGeneralTags() { + public Metadata getGeneralTags() { return generalTags; } @@ -188,13 +190,13 @@ public class HTMLMetaTags { + , refreshHref= + refreshHref + \n ); sb.append( * general tags:\n); -Iterator? it = generalTags.keySet().iterator(); -while (it.hasNext()) { - String key = (String)it.next(); +String[] names = generalTags.names(); +for (String name : names) { + String key = name; sb.append( - + key + \t=\t + generalTags.get(key) + \n); } sb.append( * http-equiv tags:\n); -it = httpEquivTags.keySet().iterator(); +IteratorObject it = httpEquivTags.keySet().iterator(); while
svn commit: r1633426 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/build.xml trunk/CHANGES.txt trunk/build.xml
Author: snagel Date: Tue Oct 21 17:52:27 2014 New Revision: 1633426 URL: http://svn.apache.org/r1633426 Log: NUTCH-1882 ant eclipse target to add output path to src/test Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/build.xml nutch/trunk/CHANGES.txt nutch/trunk/build.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1633426r1=1633425r2=1633426view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Oct 21 17:52:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.3-SNAPSHOT +* NUTCH-1882 ant eclipse target to add output path to src/test (snagel) + * NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel) * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche) Modified: nutch/branches/2.x/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1633426r1=1633425r2=1633426view=diff == --- nutch/branches/2.x/build.xml (original) +++ nutch/branches/2.x/build.xml Tue Oct 21 17:52:27 2014 @@ -954,7 +954,7 @@ library pathref=eclipse.classpath exported=false / source path=${basedir}/src/java/ / -source path=${basedir}/src/test/ / +source path=${basedir}/src/test/ output=build/test/classes / source path=${basedir}/src/plugin/creativecommons/src/java/ / source path=${basedir}/src/plugin/creativecommons/src/test/ / !-- feed is currently disabled Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1633426r1=1633425r2=1633426view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Oct 21 17:52:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1882 ant eclipse target to add output path to src/test (snagel) + * NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche) * NUTCH-1874 FileDumper comment typos ( Arthur Cinader via lewismc) Modified: nutch/trunk/build.xml URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1633426r1=1633425r2=1633426view=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Tue Oct 21 17:52:27 2014 @@ -947,7 +947,7 @@ library pathref=eclipse.classpath exported=false / source path=${basedir}/src/java/ / -source path=${basedir}/src/test/ / +source path=${basedir}/src/test/ output=build/test/classes / source path=${plugins.dir}/creativecommons/src/java/ / source path=${plugins.dir}/creativecommons/src/test/ /
svn commit: r1634694 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/crawl trunk/CHANGES.txt trunk/src/bin/crawl
Author: snagel Date: Mon Oct 27 21:38:50 2014 New Revision: 1634694 URL: http://svn.apache.org/r1634694 Log: NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/bin/crawl nutch/trunk/CHANGES.txt nutch/trunk/src/bin/crawl Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1634694r1=1634693r2=1634694view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Oct 27 21:38:50 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.3-SNAPSHOT +* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel) + * NUTCH-1882 ant eclipse target to add output path to src/test (snagel) * NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel) Modified: nutch/branches/2.x/src/bin/crawl URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1634694r1=1634693r2=1634694view=diff == --- nutch/branches/2.x/src/bin/crawl (original) +++ nutch/branches/2.x/src/bin/crawl Mon Oct 27 21:38:50 2014 @@ -30,7 +30,7 @@ elif [ $# -eq 4 ]; then LIMIT=$4 else echo Unknown # of arguments $# -echo Usage: crawl seedDir crawlDir [solrUrl] numberOfRounds +echo Usage: crawl seedDir crawlID [solrUrl] numberOfRounds exit -1; fi @@ -40,7 +40,7 @@ if [ $SEEDDIR = ]; then fi if [ $CRAWL_ID = ]; then -echo Missing crawlDir : crawl seedDir crawlID [solrURL] numberOfRounds +echo Missing crawlID : crawl seedDir crawlID [solrURL] numberOfRounds exit -1; fi @@ -98,16 +98,30 @@ if [ $mode = distributed ]; then fi fi -# initial injection -$bin/nutch inject $SEEDDIR -crawlId $CRAWL_ID -RETCODE=$? -if [ $RETCODE -ne 0 ] - then exit $RETCODE -fi +function __bin_nutch { +# run $bin/nutch, exit if exit value indicates error + +echo $bin/nutch $@ ;# echo command and arguments +$bin/nutch $@ + +RETCODE=$? 
+if [ $RETCODE -ne 0 ] +then +echo Error running: +echo $bin/nutch $@ +echo Failed with exit value $RETCODE. +exit $RETCODE +fi +} +# initial injection +echo Injecting seed URLs +__bin_nutch inject $SEEDDIR -crawlId $CRAWL_ID + + # main loop : rounds of generate - fetch - parse - update for ((a=1; a = LIMIT ; a++)) do @@ -123,58 +137,28 @@ do batchId=`date +%s`-$RANDOM echo Generating a new fetchlist - $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId - RETCODE=$? - - if [ $RETCODE -ne 0 ] -then exit $RETCODE - fi + __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId echo Fetching : - $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50 - RETCODE=$? - - if [ $RETCODE -ne 0 ] -then exit $RETCODE - fi + __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50 # parsing the batch echo Parsing : # enable the skipping of records for the parsing so that a dodgy document # so that it does not fail the full task skipRecordsOptions=-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1 - $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID - RETCODE=$? - - if [ $RETCODE -ne 0 ] -then exit $RETCODE - fi + __bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID # updatedb with this batch echo CrawlDB update for $CRAWL_ID - $bin/nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID - RETCODE=$? - - if [ $RETCODE -ne 0 ] -then exit $RETCODE - fi + __bin_nutch updatedb $commonOptions $batchId -crawlId $CRAWL_ID if [ -n $SOLRURL ]; then echo Indexing $CRAWL_ID on SOLR index - $SOLRURL -$bin/nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId $CRAWL_ID -RETCODE=$? 
- -if [ $RETCODE -ne 0 ] - then exit $RETCODE -fi +__bin_nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId $CRAWL_ID echo SOLR dedup - $SOLRURL -$bin/nutch solrdedup $commonOptions $SOLRURL -RETCODE=$? - -if [ $RETCODE -ne 0 ] - then exit $RETCODE -fi +__bin_nutch solrdedup $commonOptions $SOLRURL else echo Skipping indexing tasks: no SOLR url provided. fi Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1634694r1=1634693r2=1634694view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk
svn commit: r1638203 - in /nutch: branches/2.x/src/bin/crawl trunk/src/bin/crawl
Author: snagel Date: Tue Nov 11 16:20:01 2014 New Revision: 1638203 URL: http://svn.apache.org/r1638203 Log: NUTCH-1883 in case of generate: break loop and do not exit with error Modified: nutch/branches/2.x/src/bin/crawl nutch/trunk/src/bin/crawl Modified: nutch/branches/2.x/src/bin/crawl URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1638203r1=1638202r2=1638203view=diff == --- nutch/branches/2.x/src/bin/crawl (original) +++ nutch/branches/2.x/src/bin/crawl Tue Nov 11 16:20:01 2014 @@ -137,7 +137,22 @@ do batchId=`date +%s`-$RANDOM echo Generating a new fetchlist - __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId + generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId) + echo $bin/nutch generate ${generate_args[@]} + $bin/nutch generate ${generate_args[@]} + RETCODE=$? + if [ $RETCODE -eq 0 ]; then + : # ok: no error + elif [ $RETCODE -eq 1 ]; then +echo Generate returned 1 (no new segments created) +echo Escaping loop: no more URLs to fetch now +break + else +echo Error running: +echo $bin/nutch generate ${generate_args[@]} +echo Failed with exit value $RETCODE. 
+exit $RETCODE + fi echo Fetching : __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50 Modified: nutch/trunk/src/bin/crawl URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1638203r1=1638202r2=1638203view=diff == --- nutch/trunk/src/bin/crawl (original) +++ nutch/trunk/src/bin/crawl Tue Nov 11 16:20:01 2014 @@ -133,7 +133,22 @@ do echo `date` : Iteration $a of $LIMIT echo Generating a new segment - __bin_nutch generate $commonOptions $CRAWL_PATH/crawldb $CRAWL_PATH/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter + generate_args=($commonOptions $CRAWL_PATH/crawldb $CRAWL_PATH/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter) + echo $bin/nutch generate ${generate_args[@]} + $bin/nutch generate ${generate_args[@]} + RETCODE=$? + if [ $RETCODE -eq 0 ]; then + : # ok: no error + elif [ $RETCODE -eq 1 ]; then +echo Generate returned 1 (no new segments created) +echo Escaping loop: no more URLs to fetch now +break + else +echo Error running: +echo $bin/nutch generate ${generate_args[@]} +echo Failed with exit value $RETCODE. +exit $RETCODE + fi # capture the name of the segment # call hadoop in distributed mode @@ -168,7 +183,7 @@ do __bin_nutch invertlinks $CRAWL_PATH/linkdb $CRAWL_PATH/segments/$SEGMENT echo Dedup on crawldb - $bin/nutch dedup $CRAWL_PATH/crawldb + __bin_nutch dedup $CRAWL_PATH/crawldb if [ -n $SOLRURL ]; then echo Indexing $SEGMENT on SOLR index - $SOLRURL
svn commit: r1643412 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/conf/suffix-urlfilter.txt.template trunk/CHANGES.txt trunk/conf/suffix-urlfilter.txt.template
Author: snagel Date: Fri Dec 5 19:53:35 2014 New Revision: 1643412 URL: http://svn.apache.org/r1643412 Log: NUTCH-1877 Suffix URL filter to ignore query string by default Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/suffix-urlfilter.txt.template nutch/trunk/CHANGES.txt nutch/trunk/conf/suffix-urlfilter.txt.template Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1643412r1=1643411r2=1643412view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Dec 5 19:53:35 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.3-SNAPSHOT +* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel) + * NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel) * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel) Modified: nutch/branches/2.x/conf/suffix-urlfilter.txt.template URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/suffix-urlfilter.txt.template?rev=1643412r1=1643411r2=1643412view=diff == --- nutch/branches/2.x/conf/suffix-urlfilter.txt.template (original) +++ nutch/branches/2.x/conf/suffix-urlfilter.txt.template Fri Dec 5 19:53:35 2014 @@ -16,8 +16,19 @@ # case-insensitive, allow unknown suffixes +I -# uncomment the line below to filter on url path -#+P + +# filter on URL path only ++P +# comment out to filter on complete URL +# but be aware that the pattern +#.com +# will then reject +#http://xyz.com +#http://xyz.com/search?q=foo.com +# while the pattern +#.mp3 +# will not apply to (URLs will pass) +#http://xyz.com/music.mp3?q=abc ### prohibit these # pictures Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1643412r1=1643411r2=1643412view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Dec 5 19:53:35 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current 
Development 1.10-SNAPSHOT +* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel) + * NUTCH-1890 Major Typo in Documentation for Integrating Nutch and Solr (Boadu Akoto Charles Jnr, mattmann) * NUTCH-1887 Specify HTMLMapper to use in TikaParser (jnioche) Modified: nutch/trunk/conf/suffix-urlfilter.txt.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/suffix-urlfilter.txt.template?rev=1643412r1=1643411r2=1643412view=diff == --- nutch/trunk/conf/suffix-urlfilter.txt.template (original) +++ nutch/trunk/conf/suffix-urlfilter.txt.template Fri Dec 5 19:53:35 2014 @@ -2,8 +2,19 @@ # case-insensitive, allow unknown suffixes +I -# uncomment the line below to filter on url path -#+P + +# filter on URL path only ++P +# comment out to filter on complete URL +# but be aware that the pattern +#.com +# will then reject +#http://xyz.com +#http://xyz.com/search?q=foo.com +# while the pattern +#.mp3 +# will not apply to (URLs will pass) +#http://xyz.com/music.mp3?q=abc ### prohibit these # pictures
svn commit: r1655169 - in /nutch/branches/2.x: CHANGES.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: snagel Date: Tue Jan 27 21:45:39 2015 New Revision: 1655169 URL: http://svn.apache.org/r1655169 Log: NUTCH-1893 Parse-tika failes to parse feed files Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/parse-tika/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1655169r1=1655168r2=1655169view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Jan 27 21:45:39 2015 @@ -2,7 +2,7 @@ Nutch Change Log Current Development 2.4-SNAPSHOT -NUTCH-XX +* NUTCH-1893 Parse-tika failes to parse feed files (Mengying Wang via snagel) Nutch 2.3 Release 08012015 (ddmm) Release Report - http://s.apache.org/nutch_2.3 Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1655169r1=1655168r2=1655169view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Tue Jan 27 21:45:39 2015 @@ -39,6 +39,7 @@ dependency org=org.apache.tika name=tika-parsers rev=1.6 conf=*-default exclude org=org.apache.tika name=tika-core / /dependency +override module=rome rev=0.9/ /dependencies /ivy-module Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1655169r1=1655168r2=1655169view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Tue Jan 27 21:45:39 2015 @@ -55,7 +55,7 @@ library name=poi-ooxml-3.11-beta2.jar/ library name=poi-ooxml-schemas-3.11-beta2.jar/ library name=poi-scratchpad-3.11-beta2.jar/ - library name=rome-1.0.jar/ + library name=rome-0.9.jar/ library name=slf4j-api-1.6.1.jar/ library name=tagsoup-1.2.1.jar/ library name=tika-parsers-1.6.jar/
svn commit: r1651193 - in /nutch/trunk: CHANGES.txt build.xml
Author: snagel Date: Mon Jan 12 20:45:16 2015 New Revision: 1651193 URL: http://svn.apache.org/r1651193 Log: NUTCH-1881 ant target resolve-default to keep test libs Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1651193r1=1651192r2=1651193view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 12 20:45:16 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1881 ant target resolve-default to keep test libs (snagel) + * NUTCH-1660 Index filter for Page's latitude and longitude (Yasin Kılınç, lewismc) * NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field (Joe Liedtke, kaveh minooie via snagel) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1651193r1=1651192r2=1651193view=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Mon Jan 12 20:45:16 2015 @@ -468,13 +468,13 @@ !-- == -- !-- target: resolve = -- - target name=resolve-default depends=clean-lib description=-- resolve and retrieve dependencies with ivy + target name=resolve-default depends=clean-default-lib description=-- resolve and retrieve dependencies with ivy ivy:resolve file=${ivy.file} conf=default log=download-only/ ivy:retrieve pattern=${build.lib.dir}/[artifact]-[revision].[ext] symlink=false log=quiet/ antcall target=copy-libs/ /target - target name=resolve-test depends=clean-lib, init description=-- resolve and retrieve dependencies with ivy + target name=resolve-test depends=clean-test-lib, init description=-- resolve and retrieve dependencies with ivy ivy:resolve file=${ivy.file} conf=test log=download-only/ ivy:retrieve pattern=${test.build.lib.dir}/[artifact]-[revision].[ext] symlink=false log=quiet/ antcall target=copy-libs/ @@ -822,8 +822,15 @@ /target !-- target: clean-lib === -- - target name=clean-lib description=-- clean the project 
libraries directory (dependencies) -delete includeemptydirs=true dir=${build.lib.dir}/ + target name=clean-lib depends=clean-default-lib, clean-test-lib + description=-- clean the project libraries directories (dependencies: default + test) + /target + !-- target: clean-default-lib === -- + target name=clean-default-lib description=-- clean the project libraries directory (dependencies) +delete includeemptydirs=true dir=${build.lib.dir}/ + /target + !-- target: clean-test-lib === -- + target name=clean-test-lib description=-- clean the project test libraries directory (dependencies) delete includeemptydirs=true dir=${test.build.lib.dir}/ /target
svn commit: r1650181 - in /nutch/trunk: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/more/Te
Author: snagel Date: Wed Jan 7 22:25:18 2015 New Revision: 1650181 URL: http://svn.apache.org/r1650181 Log: NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650181r1=1650180r2=1650181view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 7 22:25:18 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field (Joe Liedtke, kaveh minooie via snagel) + * NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann) * NUTCH-1897 Easier debugging of plugin XML errors (markus) Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650181r1=1650180r2=1650181view=diff == --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Jan 7 22:25:18 2015 @@ -289,7 +289,7 @@ public class MoreIndexingFilter implemen private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) { String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION); -if (contentDisposition == null) +if (contentDisposition == null || doc.getFieldValue(title) != null) return doc; for (int i=0; ipatterns.length; i++) { Modified: nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650181r1=1650180r2=1650181view=diff == --- nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Wed Jan 7 22:25:18 2015 @@ -82,11 +82,21 @@ public class TestMoreIndexingFilter { MoreIndexingFilter filter = new MoreIndexingFilter(); filter.setConf(conf); -NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(text, new ParseData( - new ParseStatus(), title, new Outlink[0], metadata)), new Text( -http://www.example.com/;), new CrawlDatum(), new Inlinks()); +Text url = new Text(http://www.example.com/;); +ParseImpl parseImpl = new ParseImpl(text, new ParseData( +new ParseStatus(), title, new Outlink[0], metadata)); + +NutchDocument doc = new NutchDocument(); +doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); Assert.assertEquals(content-disposition not detected, filename.ext, doc.getFieldValue(title)); + +/* NUTCH-1140: do not add second title to avoid a multi-valued title field */ +doc = new NutchDocument(); +doc.add(title, title); +doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); +Assert.assertEquals(do not add second title by content-disposition, +title, doc.getFieldValue(title)); } private void assertParts(String[] parts, int count, String... expected) {
svn commit: r1670442 - /nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
Author: snagel Date: Tue Mar 31 19:28:14 2015 New Revision: 1670442 URL: http://svn.apache.org/r1670442 Log: NUTCH-1979 CrawlDbReader to implement Tool: fix unit test Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1670442&r1=1670441&r2=1670442&view=diff == --- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Tue Mar 31 19:28:14 2015 @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; import org.apache.nutch.util.NutchConfiguration; import org.junit.After; import org.junit.Assert; @@ -113,7 +114,7 @@ public class TestCrawlDbMerger { String url = it.next(); LOG.fine(url= + url); CrawlDatum cd = expected.get(url); - CrawlDatum res = reader.get(crawlDb, url, conf); + CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf)); LOG.fine( - + res); System.out.println(url= + url); System.out.println( cd + cd);
svn commit: r1669692 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ branches/2.x/src/plugin/protocol-httpclient/src/java/or
Author: snagel Date: Fri Mar 27 21:42:35 2015 New Revision: 1669692 URL: http://svn.apache.org/r1669692 Log: NUTCH-1941 Optional rolling http.agent.names Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/nutch-default.xml nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1669692r1=1669691r2=1669692view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Mar 27 21:42:35 2015 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.4-SNAPSHOT +* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel) + * NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich via lewismc) * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) Modified: nutch/branches/2.x/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1669692r1=1669691r2=1669692view=diff == --- nutch/branches/2.x/conf/nutch-default.xml (original) +++ nutch/branches/2.x/conf/nutch-default.xml Fri Mar 27 21:42:35 2015 @@ -162,6 +162,26 @@ /property property + namehttp.agent.rotate/name + valuefalse/value + description +If true, instead of http.agent.name, alternating agent names are +chosen from a list provided via 
http.agent.rotate.file. + /description +/property + +property + namehttp.agent.rotate.file/name + valueagents.txt/value + description +File containing alternative user agent names to be used instead of +http.agent.name on a rotating basis if http.agent.rotate is true. +Each line of the file should contain exactly one agent +specification including name, version, description, URL, etc. + /description +/property + +property namehttp.agent.host/name value/value descriptionName or IP address of the host on which the Nutch crawler Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692r1=1669691r2=1669692view=diff == --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Mar 27 21:42:35 2015 @@ -17,16 +17,22 @@ package org.apache.nutch.protocol.http.api; // JDK imports +import java.io.BufferedReader; import java.io.IOException; +import java.io.Reader; import java.net.URL; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; @@ -53,6 +59,8 @@ public abstract class HttpBase implement private HttpRobotRulesParser robots = null; + private ArrayListString userAgentNames = null; + /** The proxy hostname. 
*/ protected String proxyHost = null; @@ -132,6 +140,45 @@ public abstract class HttpBase implement this.responseTime = conf.getBoolean(http.store.responsetime, true); this.robots.setConf(conf); +// NUTCH-1941: read list of alternating agent names +if (conf.getBoolean(http.agent.rotate, false)) { + String agentsFile = conf.get(http.agent.rotate.file, agents.txt); + BufferedReader br = null; + try { +Reader reader = conf.getConfResourceAsReader(agentsFile); +br = new BufferedReader(reader); +userAgentNames = new ArrayListString(); +String word = ; +while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty
svn commit: r1678824 - in /nutch/trunk: CHANGES.txt src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
Author: snagel Date: Mon May 11 21:04:59 2015 New Revision: 1678824 URL: http://svn.apache.org/r1678824 Log: NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper: fix unit test Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678824r1=1678823r2=1678824view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon May 11 21:04:59 2015 @@ -2,7 +2,7 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT -* NUTCH-1988 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann) +* NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann) * NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel via lewismc) @@ -54,8 +54,6 @@ Release Report: http://s.apache.org/nutc * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann) -* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann) - * NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel) * NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann) Modified: nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java?rev=1678824r1=1678823r2=1678824view=diff == --- nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java (original) +++ nutch/trunk/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java Mon May 11 21:04:59 2015 @@ -101,20 +101,16 @@ public class TestCommonCrawlDataDumper { CommonCrawlDataDumper dumper = new CommonCrawlDataDumper( new CommonCrawlConfig()); - dumper.dump(tempDir, sampleSegmentDir, false, null, false); + 
dumper.dump(tempDir, sampleSegmentDir, false, null, false, ); CollectionFile tempFiles = FileUtils.listFiles(tempDir, FileFilterUtils.fileFileFilter(), FileFilterUtils.directoryFileFilter()); - boolean hasAll = true; for (String expectedFileName : crawledFiles) { - if (!hasFile(expectedFileName, tempFiles)) { - hasAll = false; - break; - } + assertTrue(Missed file + expectedFileName + in dump, + hasFile(expectedFileName, tempFiles)); } - assertTrue(hasAll); }
svn commit: r1680110 - in /nutch/trunk: CHANGES.txt conf/log4j.properties
Author: snagel Date: Mon May 18 21:39:23 2015 New Revision: 1680110 URL: http://svn.apache.org/r1680110 Log: NUTCH-2013 Fetcher: missing logs fetching ... on stdout Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1680110&r1=1680109&r2=1680110&view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon May 18 21:39:23 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel) + * NUTCH-2014 Fetcher hang-up on completion (snagel) * NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen Shah via mattmann) Modified: nutch/trunk/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1680110&r1=1680109&r2=1680110&view=diff == --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Mon May 18 21:39:23 2015 @@ -28,6 +28,11 @@ log4j.logger.org.apache.nutch.crawl.Craw log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.Generator=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.Fetcher=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherItem=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherItemQueue=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherItemQueues=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.QueueFeeder=INFO,cmdstdout log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout
svn commit: r1680109 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: snagel Date: Mon May 18 21:35:03 2015 New Revision: 1680109 URL: http://svn.apache.org/r1680109 Log: NUTCH-2014 Fetcher hang-up on completion Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1680109&r1=1680108&r2=1680109&view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon May 18 21:35:03 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2014 Fetcher hang-up on completion (snagel) + * NUTCH-2011 Endpoint to support realtime JSON output from the fetcher (Sujen Shah via mattmann) * NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1680109&r1=1680108&r2=1680109&view=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon May 18 21:35:03 2015 @@ -214,7 +214,7 @@ public class Fetcher extends NutchTool i for (int i = 0; i < threadCount; i++) { // spawn threads FetcherThread t = new FetcherThread(getConf(), getActiveThreads(), fetchQueues, - feeder, spinWaiting, lastRequestStart, reporter, activeThreads, segmentName, + feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName, parsing, output, storingContent, pages, bytes); fetcherThreads.add(t); t.start();
svn commit: r1674399 - in /nutch/trunk: ./ conf/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-ftp/src/java/org/apache/nutch/
Author: snagel Date: Fri Apr 17 20:49:19 2015 New Revision: 1674399 URL: http://svn.apache.org/r1674399 Log: NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing Removed: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674399r1=1674398r2=1674399view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Apr 17 20:49:19 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel) + * NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann) * NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann) Modified: nutch/trunk/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1674399r1=1674398r2=1674399view=diff == --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Fri Apr 17 20:49:19 2015 @@ -54,6 +54,7 @@ log4j.logger.org.apache.nutch.indexer.In log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout +log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN log4j.logger.org.apache.nutch=INFO Modified: nutch/trunk/conf/nutch-default.xml URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1674399r1=1674398r2=1674399view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Apr 17 20:49:19 2015 @@ -118,6 +118,15 @@ /property property + namehttp.robot.rules.whitelist/name + value/value + descriptionComma separated list of hostnames or IP addresses to ignore + robot rules parsing for. Use with care and only if you are explicitly + allowed by the site owner to ignore the site's robots.txt! + /description +/property + +property namehttp.robots.403.allow/name valuetrue/value descriptionSome servers return HTTP status 403 (Forbidden) if Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1674399r1=1674398r2=1674399view=diff == --- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Apr 17 20:49:19 2015 @@ -20,10 +20,15 @@ package org.apache.nutch.protocol; // JDK imports import java.io.File; import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; import java.io.LineNumberReader; +import java.net.MalformedURLException; import java.net.URL; -import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.Hashtable; +import java.util.Set; import java.util.StringTokenizer; // Commons Logging imports @@ -32,10 +37,11 @@ import org.slf4j.LoggerFactory; // Nutch imports import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.io.Text; - -import com.google.common.io.Files; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.util.NutchConfiguration; import crawlercommons.robots.BaseRobotRules; import 
crawlercommons.robots.SimpleRobotRules; @@ -46,8 +52,11 @@ import crawlercommons.robots.SimpleRobot * This class uses crawler-commons for handling the parsing of * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe * the download permissions as described in SimpleRobotRulesParser. + * + * Protocol-specific implementations have to implement the method + * {@link getRobotRulesSet}. */ -public abstract class RobotRulesParser implements Configurable { +public abstract class RobotRulesParser implements Tool { public static final Logger LOG = LoggerFactory .getLogger
svn commit: r1674581 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/segment/SegmentChecker.java
Author: snagel Date: Sat Apr 18 20:41:13 2015 New Revision: 1674581 URL: http://svn.apache.org/r1674581 Log: NUTCH-1854 bin/crawl fails with a parsing fetcher Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674581r1=1674580r2=1674581view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Apr 18 20:41:13 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel) + * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann) * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann) @@ -12,7 +14,7 @@ Nutch Current Development 1.10-SNAPSHOT * NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann) -* NUTCH-1911 Imeprove DomainStatistics tool command line parsing (Michael Joyce via mattmann) +* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce via mattmann) * NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1674581r1=1674580r2=1674581view=diff == --- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sat Apr 18 20:41:13 2015 @@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.*; @@ -32,6 +33,7 @@ import org.apache.nutch.net.protocols.Re 
import org.apache.nutch.protocol.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; +import org.apache.hadoop.fs.FileSystem; import org.apache.nutch.util.*; import org.apache.hadoop.fs.Path; @@ -198,6 +200,11 @@ public class ParseSegment extends Config } public void parse(Path segment) throws IOException { + if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) { + LOG.warn("Segment: " + segment + + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854 + return; + } SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1674581&r1=1674580&r2=1674581&view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sat Apr 18 20:41:13 2015 @@ -115,4 +115,16 @@ public class SegmentChecker { } } -} \ No newline at end of file + /** + * Check the segment to see if it is has been parsed before. + */ + public static boolean isParsed(Path segment, FileSystem fs) + throws IOException { + + if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))) + return true; + return false; + + } + +}
svn commit: r1672939 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/ivy/ivy.xml trunk/CHANGES.txt trunk/ivy/ivy.xml
Author: snagel Date: Sat Apr 11 22:07:52 2015 New Revision: 1672939 URL: http://svn.apache.org/r1672939 Log: NUTCH-1981 Upgrade to icu4j 55.1 Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/ivy/ivy.xml nutch/trunk/CHANGES.txt nutch/trunk/ivy/ivy.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1672939r1=1672938r2=1672939view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Apr 11 22:07:52 2015 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.4-SNAPSHOT +* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel) + * NUTCH-1944 Index HTML raw content (meabed via mattmann) * NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel) Modified: nutch/branches/2.x/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1672939r1=1672938r2=1672939view=diff == --- nutch/branches/2.x/ivy/ivy.xml (original) +++ nutch/branches/2.x/ivy/ivy.xml Sat Apr 11 22:07:52 2015 @@ -54,7 +54,7 @@ exclude org=org.mortbay.jetty name=jsp-* / /dependency -dependency org=com.ibm.icu name=icu4j rev=4.0.1 / +dependency org=com.ibm.icu name=icu4j rev=55.1 / dependency org=org.apache.tika name=tika-core rev=1.7 / dependency org=com.googlecode.juniversalchardet name=juniversalchardet rev=1.0.3/ Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1672939r1=1672938r2=1672939view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Apr 11 22:07:52 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel) + * NUTCH-1960 JUnit test for dump method of CommonCrawlDataDumper (Giuseppe Totaro via mattmann) * NUTCH-1983 CommonCrawlDumper and FileDumper don't dump correct JSON (mattmann) Modified: nutch/trunk/ivy/ivy.xml URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1672939r1=1672938r2=1672939view=diff == --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Sat Apr 11 22:07:52 2015 @@ -62,7 +62,7 @@ /dependency dependency org=org.apache.tika name=tika-core rev=1.7 / - dependency org=com.ibm.icu name=icu4j rev=4.0.1 / + dependency org=com.ibm.icu name=icu4j rev=55.1 / dependency org=xerces name=xercesImpl rev=2.9.1 / dependency org=xerces name=xmlParserAPIs rev=2.6.2 /
svn commit: r1687604 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDb.java
Author: snagel Date: Thu Jun 25 18:41:26 2015 New Revision: 1687604 URL: http://svn.apache.org/r1687604 Log: NUTCH-2000 Link inversion fails with .locked already exists Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1687604r1=1687603r2=1687604view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jun 25 18:41:26 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2000 Link inversion fails with .locked already exists (jnioche, snagel) + * NUTCH-2036 Adding some continuous crawl goodies to the crawl script (jorge, snagel) * NUTCH-2039 Relevance based scoring filter (Sujen Shah, lewismc via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1687604r1=1687603r2=1687604view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jun 25 18:41:26 2015 @@ -196,6 +196,7 @@ public class LinkDb extends NutchTool im job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { LOG.warn(LinkDb: internal and external links are ignored! + Nothing to do, actually. Exiting.); + LockUtil.removeLockFile(fs, lock); return; }
svn commit: r1682103 - in /nutch/trunk: CHANGES.txt src/bin/nutch
Author: snagel Date: Wed May 27 19:31:51 2015 New Revision: 1682103 URL: http://svn.apache.org/r1682103 Log: NUTCH-2007 add test libs to classpath of bin/nutch junit Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/bin/nutch Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682103r1=1682102r2=1682103view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed May 27 19:31:51 2015 @@ -2,7 +2,9 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT -* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist +* NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel) + +* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro) * NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel) Modified: nutch/trunk/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1682103r1=1682102r2=1682103view=diff == --- nutch/trunk/src/bin/nutch (original) +++ nutch/trunk/src/bin/nutch Wed May 27 19:31:51 2015 @@ -270,6 +270,11 @@ elif [ $COMMAND = plugin ] ; then CLASS=org.apache.nutch.plugin.PluginRepository elif [ $COMMAND = junit ] ; then CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/ + if $local; then +for f in $NUTCH_HOME/test/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; +done + fi CLASS=org.junit.runner.JUnitCore elif [ $COMMAND = startserver ] ; then CLASS=org.apache.nutch.service.NutchServer
svn commit: r1691436 - /nutch/trunk/CHANGES.txt
Author: snagel Date: Thu Jul 16 19:52:00 2015 New Revision: 1691436 URL: http://svn.apache.org/r1691436 Log: remove duplicate entries Modified: nutch/trunk/CHANGES.txt Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1691436r1=1691435r2=1691436view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 16 19:52:00 2015 @@ -100,16 +100,6 @@ Release Report: http://s.apache.org/nutc * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel) -* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann) - -* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel) - -* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann) - -* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann) - -* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce via mattmann) - * NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel) * NUTCH-1960 JUnit test for dump method of CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
svn commit: r1714655 - in /nutch/branches/2.x: CHANGES.txt conf/schema.xml
Author: snagel Date: Mon Nov 16 20:29:33 2015 New Revision: 1714655 URL: http://svn.apache.org/viewvc?rev=1714655=rev Log: NUTCH-2130 copyField rawcontent creates error within schema.xml Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/schema.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1714655=1714654=1714655=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Nov 16 20:29:33 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 2.3.1 Release 22092015 (ddmm) Release Report - http://s.apache.org/nutch_2.3.1 +* NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel) + * NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the Release Management Documentation (lewismc) * NUTCH-2105 Update Nutch Cassandra Dockerfile to work with Gora Nutch 2.3.1 (lewismc) Modified: nutch/branches/2.x/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1714655=1714654=1714655=diff == --- nutch/branches/2.x/conf/schema.xml (original) +++ nutch/branches/2.x/conf/schema.xml Mon Nov 16 20:29:33 2015 @@ -32,6 +32,7 @@ + + + + + id text @@ -367,7 +374,6 @@ or to add multiple fields to the same field for easier/faster searching. --> -
svn commit: r1707360 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherThread.java
Author: snagel Date: Wed Oct 7 19:02:42 2015 New Revision: 1707360 URL: http://svn.apache.org/viewvc?rev=1707360=rev Log: NUTCH-2124 Fetcher following same redirect again and again Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1707360=1707359=1707360=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Oct 7 19:02:42 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2124 Fetcher following same redirect again and again (Yogendra Kumar Soni via snagel) + * NUTCH-2123 Seed List REST API returns Text but headers indicate/require JSON (Aron Ahmadia, Sujen Shah via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1707360=1707359=1707360=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Wed Oct 7 19:02:42 2015 @@ -325,7 +325,7 @@ public class FetcherThread extends Threa newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { - queueRedirect(redirUrl, fit); + fit = queueRedirect(redirUrl, fit); } } break; @@ -346,7 +346,7 @@ public class FetcherThread extends Threa Text redirUrl = handleRedirect(fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); if (redirUrl != null) { -queueRedirect(redirUrl, fit); +fit = queueRedirect(redirUrl, fit); } else { // stop redirecting redirecting = false; @@ -485,7 +485,7 @@ public class FetcherThread extends Threa } } - private void queueRedirect(Text redirUrl, FetchItem fit) + private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore()); @@ -506,6 +506,7 @@ public class FetcherThread extends Threa reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); } +return fit; } private void logError(Text url, String message) {
svn commit: r1704425 - in /nutch/trunk: ./ src/plugin/lib-selenium/ src/plugin/protocol-interactiveselenium/ src/plugin/protocol-selenium/
Author: snagel Date: Mon Sep 21 21:14:55 2015 New Revision: 1704425 URL: http://svn.apache.org/viewvc?rev=1704425=rev Log: NUTCH-2106 Runtime to contain Selenium and dependencies only once Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-selenium/build-ivy.xml nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt nutch/trunk/src/plugin/lib-selenium/ivy.xml nutch/trunk/src/plugin/lib-selenium/plugin.xml nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml nutch/trunk/src/plugin/protocol-selenium/ivy.xml nutch/trunk/src/plugin/protocol-selenium/plugin.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1704425=1704424=1704425=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Sep 21 21:14:55 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2106 Runtime to contain Selenium and dependencies only once (snagel) + * NUTCH-2104 Add documentation to the protocol-selenium plugin Readme file re: selenium grid implementation (Kim Whitehall via mattmann) Modified: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build-ivy.xml?rev=1704425=1704424=1704425=diff == --- nutch/trunk/src/plugin/lib-selenium/build-ivy.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/build-ivy.xml Mon Sep 21 21:14:55 2015 @@ -48,7 +48,7 @@ - + Modified: nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt?rev=1704425=1704424=1704425=diff == --- nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt (original) +++ nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt Mon Sep 21 21:14:55 
2015 @@ -1,6 +1,9 @@ 1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml -2. Upgrade Tika's own dependencies in src/plugin/lib-selenium/plugin.xml - To get the list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ /g' +2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml + + To get a list of dependencies and their versions execute: +$ ant -f ./build-ivy.xml +$ ls lib | sed 's/^/ \n \n <\/library>/g' + + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). Modified: nutch/trunk/src/plugin/lib-selenium/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1704425=1704424=1704425=diff == --- nutch/trunk/src/plugin/lib-selenium/ivy.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Mon Sep 21 21:14:55 2015 @@ -27,7 +27,7 @@ - + Modified: nutch/trunk/src/plugin/lib-selenium/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1704425=1704424=1704425=diff == --- nutch/trunk/src/plugin/lib-selenium/plugin.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Mon Sep 21 21:14
svn commit: r1718678 - in /nutch/trunk: conf/nutch-default.xml default.properties src/bin/nutch
Author: snagel Date: Tue Dec 8 19:18:19 2015 New Revision: 1718678 URL: http://svn.apache.org/viewvc?rev=1718678=rev Log: Update Nutch trunk for new development: 1.11 -> 1.12 Modified: nutch/trunk/conf/nutch-default.xml nutch/trunk/default.properties nutch/trunk/src/bin/nutch Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1718678=1718677=1718678=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Dec 8 19:18:19 2015 @@ -164,7 +164,7 @@ http.agent.version - Nutch-1.11-SNAPSHOT + Nutch-1.12-SNAPSHOT A version string to advertise in the User-Agent header. Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1718678=1718677=1718678=diff == --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Tue Dec 8 19:18:19 2015 @@ -14,7 +14,7 @@ # limitations under the License. name=apache-nutch -version=1.11-SNAPSHOT +version=1.12-SNAPSHOT final.name=${name}-${version} year=2015 Modified: nutch/trunk/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1718678=1718677=1718678=diff == --- nutch/trunk/src/bin/nutch (original) +++ nutch/trunk/src/bin/nutch Tue Dec 8 19:18:19 2015 @@ -53,7 +53,7 @@ done # if no args specified, show usage if [ $# = 0 ]; then - echo "nutch 1.11" + echo "nutch 1.12" echo "Usage: nutch COMMAND" echo "where COMMAND is one of:" echo " readdbread / dump crawl db"
svn commit: r1717537 - in /nutch/branches/2.x: CHANGES.txt src/plugin/subcollection/plugin.xml src/plugin/urlnormalizer-regex/plugin.xml
Author: snagel Date: Tue Dec 1 21:17:14 2015 New Revision: 1717537 URL: http://svn.apache.org/viewvc?rev=1717537=rev Log: NUTCH-2107 plugin.xml to validate against plugin.dtd Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/subcollection/plugin.xml nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1717537=1717536=1717537=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Dec 1 21:17:14 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 2.3.1 Release 22092015 (ddmm) Release Report - http://s.apache.org/nutch_2.3.1 +* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) + * NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel) * NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the Release Management Documentation (lewismc) Modified: nutch/branches/2.x/src/plugin/subcollection/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/plugin.xml?rev=1717537=1717536=1717537=diff == --- nutch/branches/2.x/src/plugin/subcollection/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/subcollection/plugin.xml Tue Dec 1 21:17:14 2015 @@ -21,16 +21,16 @@ version="1.0.0" provider-name="apache.org"> - - - - + + + + Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml?rev=1717537=1717536=1717537=diff == --- nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/plugin.xml Tue Dec 1 21:17:14 2015 @@ -28,7 +28,7 @@ - +
svn commit: r1717536 - in /nutch/trunk: CHANGES.txt src/plugin/subcollection/plugin.xml src/plugin/urlnormalizer-regex/plugin.xml
Author: snagel Date: Tue Dec 1 21:15:21 2015 New Revision: 1717536 URL: http://svn.apache.org/viewvc?rev=1717536=rev Log: NUTCH-2107 plugin.xml to validate against plugin.dtd Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/subcollection/plugin.xml nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1717536=1717535=1717536=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Dec 1 21:15:21 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 1.11 Release 25/10/2015 (dd/mm/) Release Report: http://s.apache.org/nutch11 +* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) + * NUTCH-2177 Generator produces only one partition even in distributed mode (jnioche, snagel) * NUTCH-2158 Upgrade to Tika 1.11 (jnioche, snagel) Modified: nutch/trunk/src/plugin/subcollection/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=1717536=1717535=1717536=diff == --- nutch/trunk/src/plugin/subcollection/plugin.xml (original) +++ nutch/trunk/src/plugin/subcollection/plugin.xml Tue Dec 1 21:15:21 2015 @@ -21,16 +21,16 @@ version="1.0.0" provider-name="apache.org"> - - - - + + + + Modified: nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml?rev=1717536=1717535=1717536=diff == --- nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/plugin.xml Tue Dec 1 21:15:21 2015 @@ -28,7 +28,7 @@ - +
svn commit: r1718223 - in /nutch/trunk: CHANGES.txt conf/contenttype-mapping.txt.template src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel Date: Sun Dec 6 21:14:06 2015 New Revision: 1718223 URL: http://svn.apache.org/viewvc?rev=1718223=rev Log: NUTCH-2172 index-more: document format of contenttype-mapping.txt Added: nutch/trunk/conf/contenttype-mapping.txt.template Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718223=1718222=1718223=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sun Dec 6 21:14:06 2015 @@ -1,5 +1,7 @@ Nutch Change Log - + +* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel) + Nutch 1.11 Release 03/12/2015 (dd/mm/) Release Report: http://s.apache.org/nutch11 Added: nutch/trunk/conf/contenttype-mapping.txt.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/contenttype-mapping.txt.template?rev=1718223=auto == --- nutch/trunk/conf/contenttype-mapping.txt.template (added) +++ nutch/trunk/conf/contenttype-mapping.txt.template Sun Dec 6 21:14:06 2015 @@ -0,0 +1,22 @@ +# +# Mapping of detected content types (MIME types) to custom types (target types) +# used by the plugin index-more when filling the index field `type'. +# +# Note: The mappings defined in this file are only active if the property +# `moreIndexingFilter.mapMimeTypes' is true. +# +# Format (tab-separated plain text, comment lines start with `#'): +# +# [ ...] 
+# +# Examples (comment in to activate): +# +# map XHTML to HTML +#text/html application/xhtml+xml +# +# Map XHTML and HTML to a custom type "web page" +#web page text/html application/xhtml+xml +# +# map various office document formats to a custom type "office document" +#office document application/vnd.oasis.opendocument.text application/x-tika-msoffice application/msword +# Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1718223=1718222=1718223=diff == --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Dec 6 21:14:06 2015 @@ -312,10 +312,12 @@ public class MoreIndexingFilter implemen } private void readConfiguration() throws IOException { +LOG.info("Reading content type mappings from file contenttype-mapping.txt"); BufferedReader reader = new BufferedReader( conf.getConfResourceAsReader("contenttype-mapping.txt")); String line; String parts[]; +boolean formatWarningShown = false; mimeMap = new HashMap<String, String>(); @@ -329,6 +331,12 @@ public class MoreIndexingFilter implemen for (int i = 1; i < parts.length; i++) { mimeMap.put(parts[i].trim(), parts[0].trim()); } +} else { + LOG.warn("Wrong format of line: {}", line); + if (!formatWarningShown) { +LOG.warn("Expected format:[ ...]"); +formatWarningShown = true; + } } } }
svn commit: r1718718 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java trunk/CHANGES.txt trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Author: snagel Date: Tue Dec 8 21:45:47 2015 New Revision: 1718718 URL: http://svn.apache.org/viewvc?rev=1718718=rev Log: NUTCH-2042 parse-html increase chunk size used to detect charset Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1718718=1718717=1718718=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Dec 8 21:45:47 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 2.3.1 Release 22092015 (ddmm) Release Report - http://s.apache.org/nutch_2.3.1 +* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel) + * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) * NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel) Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718=1718717=1718718=diff == --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec 8 21:45:47 2015 @@ -27,6 +27,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -67,7 +68,8 @@ public class HtmlParser implements Parse // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. 
http://cn.promo.yahoo.com/customcare/music.html) - private static final int CHUNK_SIZE = 2000; + // NUTCH-2042 (cf. TIKA-357): increased to 8 kB + private static final int CHUNK_SIZE = 8192; // NUTCH-1006 Meta equiv with single quotes not accepted private static Pattern metaPattern = Pattern.compile( @@ -111,14 +113,8 @@ public class HtmlParser implements Parse // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. -String str = ""; -try { - str = new String(content.array(), content.arrayOffset() - + content.position(), length, Charset.forName("ASCII").toString()); -} catch (UnsupportedEncodingException e) { - // code should never come here, but just in case... - return null; -} +String str = new String(content.array(), content.arrayOffset() ++ content.position(), length, StandardCharsets.US_ASCII); Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718718=1718717=1718718=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Dec 8 21:45:47 2015 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel) + * NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel) Nutch 1.11 Release 03/12/2015 (dd/mm/) Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718=1718717=1718718=diff == --- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec 8 21:45:47 2015 @@ -21,7 +21,7 @@ import java.util.ArrayList; import java.util.Map; import 
java.net.URL; import java.net.MalformedURLException; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.io.*; import java.util.regex.*; @@ -30,10 +30,8 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.w3c.dom.*; import org.apache.html.dom.*; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.Content; @@ -48,7 +
svn commit: r1723851 - in /nutch/branches/2.x: CHANGES.txt src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Author: snagel Date: Sat Jan 9 13:01:31 2016 New Revision: 1723851 URL: http://svn.apache.org/viewvc?rev=1723851=rev Log: NUTCH-2168 Parse-tika fails to retrieve parser Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723851=1723850=1723851=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Jan 9 13:01:31 2016 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 2.3.1 Release 22092015 (ddmm) Release Report - http://s.apache.org/nutch_2.3.1 +* NUTCH-2168 Parse-tika fails to retrieve parser (snagel, Auro Miralles, lewismc) + * NUTCH-2169 Integrate index-html into Nutch build (snagel) * NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel) Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1723851=1723850=1723851=diff == --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Sat Jan 9 13:01:31 2016 @@ -207,7 +207,7 @@ public class TikaParser implements org.a this.tikaConfig = null; try { - tikaConfig = TikaConfig.getDefaultConfig(); + tikaConfig = new TikaConfig(this.getClass().getClassLoader()); } catch (Exception e2) { String message = "Problem loading default Tika configuration"; LOG.error(message, e2);
svn commit: r1723626 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/crawl/GeneratorJob.java
Author: snagel Date: Thu Jan 7 20:57:13 2016 New Revision: 1723626 URL: http://svn.apache.org/viewvc?rev=1723626=rev Log: NUTCH-2143 GeneratorJob ignores batch id passed as argument Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723626=1723625=1723626=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Thu Jan 7 20:57:13 2016 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 2.3.1 Release 22092015 (ddmm) Release Report - http://s.apache.org/nutch_2.3.1 +* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel) + * NUTCH-2042 parse-html increase chunk size used to detect charset (snagel) * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1723626=1723625=1723626=diff == --- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu Jan 7 20:57:13 2016 @@ -163,17 +163,20 @@ public class GeneratorJob extends NutchT return fields; } + /** Generate a random batch id */ + public static String randomBatchId() { +long curTime = System.currentTimeMillis(); +int randomSeed = Math.abs(new Random().nextInt()); +String batchId = (curTime / 1000) + "-" + randomSeed; +return batchId; + } + public Map<String, Object> run(Map<String, Object> args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); -if (batchId != null) { - getConf().set(GeneratorJob.BATCH_ID, batchId); -} else { - // generate batchId - long curTime = System.currentTimeMillis(); - int randomSeed = Math.abs(new Random().nextInt()); - batchId = (curTime / 1000) + "-" + randomSeed; 
- getConf().set(BATCH_ID, batchId); +if (batchId == null) { + batchId = randomBatchId(); } +getConf().set(BATCH_ID, batchId); // map to inverted subset due for fetch, sort by score Long topN = null; @@ -249,10 +252,15 @@ public class GeneratorJob extends NutchT if (topN != Long.MAX_VALUE) { LOG.info("GeneratorJob: topN: " + topN); } +String batchId = getConf().get(BATCH_ID); Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN, Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter, -Nutch.ARG_NORMALIZE, norm)); -String batchId = getConf().get(BATCH_ID); +Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId)); +if (batchId == null) { + // use generated random batch id + batchId = (String) results.get(BATCH_ID); +} + long finish = System.currentTimeMillis(); long generateCount = (Long) results.get(GENERATE_COUNT); LOG.info("GeneratorJob: finished at " + sdf.format(finish) @@ -290,11 +298,6 @@ public class GeneratorJob extends NutchT long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE; boolean filter = true, norm = true; -// generate batchId -int randomSeed = Math.abs(new Random().nextInt()); -String batchId = (curTime / 1000) + "-" + randomSeed; -getConf().set(BATCH_ID, batchId); - for (int i = 0; i < args.length; i++) { if ("-topN".equals(args[i])) { topN = Long.parseLong(args[++i]); @@ -307,9 +310,9 @@ public class GeneratorJob extends NutchT } else if ("-adddays".equals(args[i])) { long numDays = Integer.parseInt(args[++i]); curTime += numDays * 1000L * 60 * 60 * 24; - } else if ("-batchId".equals(args[i])) + } else if ("-batchId".equals(args[i])) { getConf().set(BATCH_ID, args[++i]); - else { + } else { System.err.println("Unrecognized arg " + args[i]); return -1; }
svn commit: r1716177 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
Author: snagel Date: Tue Nov 24 15:37:32 2015 New Revision: 1716177 URL: http://svn.apache.org/viewvc?rev=1716177=rev Log: NUTCH-2175 Typos in property descriptions Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1716177=1716176=1716177=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Nov 24 15:37:32 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 1.11 Release 25/10/2015 (dd/mm/) Release Report: http://s.apache.org/nutch11 +* NUTCH-2175 Typos in property descriptions in nutch-default.xml (Roannel Fernández Hernández via snagel) + * NUTCH-2069 Ignore external links based on domain (jnioche) * NUTCH-2173 String.join in FileDumper breaks the build (joyce) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1716177=1716176=1716177=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Nov 24 15:37:32 2015 @@ -51,7 +51,7 @@ true The crawler is not restricted to the directories that you specified in the Urls file but it is jumping into the parent directories as well. For your own crawlings you can -change this bahavior (set to false) the way that only directories beneath the directories that you specify get +change this behavior (set to false) the way that only directories beneath the directories that you specify get crawled. @@ -209,7 +209,7 @@ 100 The number of times a thread will delay when trying to fetch a page. Each time it finds that a host is busy, it will wait - fetcher.server.delay. After http.max.delays attepts, it will give + fetcher.server.delay. After http.max.delays attempts, it will give up on the page for now. @@ -752,7 +752,7 @@ 5.0 The number of seconds the fetcher will delay between successive requests to the same server. 
Note that this might get - overriden by a Crawl-Delay from a robots.txt and is used ONLY if + overridden by a Crawl-Delay from a robots.txt and is used ONLY if fetcher.threads.per.queue is set to 1. @@ -1102,8 +1102,8 @@ plugin.auto-activation true Defines if some plugins that are not activated regarding - the plugin.includes and plugin.excludes properties must be automaticaly - activated if they are needed by some actived plugins. + the plugin.includes and plugin.excludes properties must be automatically + activated if they are needed by some active plugins. @@ -1218,14 +1218,13 @@ parsefilter.naivebayes.trainfile naivebayes-train.txt Set the name of the file to be used for Naive Bayes training. The format will be: -Each line contains two tab seperted parts +Each line contains two tab separated parts There are two columns/parts: -1. "1" or "0", "1" for relevent and "0" for irrelevent document. -3. Text (text that will be used for training) +1. "1" or "0", "1" for relevant and "0" for irrelevant documents. +2. Text (text that will be used for training) Each row will be considered a new "document" for the classifier. CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this classifier. - @@ -1272,7 +1271,7 @@ CAUTION: Set the parser.timeout to -1 or tika.htmlmapper.classname org.apache.tika.parser.html.IdentityHtmlMapper Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence - the behaviour of the HTMLParseFilters. + the behavior of the HTMLParseFilters. --> @@ -1360,7 +1359,7 @@ CAUTION: Set the parser.timeout to -1 or scoring.depth.max 1000 Max depth value from seed allowed by default. - Can be overriden on a per-seed basis by specifying "_maxdepth_=VALUE" + Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE" as a seed metadata. This plugin adds a "_depth_" metadatum to the pages to track the distance from the seed it was found from. 
The depth is used to prioritise URLs in the generation step so that @@ -1373,7 +1372,7 @@ CAUTION: Set the parser.timeout to -1 or lang.analyze.max.length 2048 - The maximum bytes of data to uses to indentify + The maximum number of bytes used to identify the language (0 means full content analysis). The larger is this value, the better is the analysis, but the slowest it is. @@ -1667,7 +1666,7 @@ CAUTION: Set the parser.timeout to -1 or solr.loadbalance.urls - A comma-seperated value representing the Solr servers to be used when + A comma-separated value representing the Solr servers to be used when initi
nutch git commit: NUTCH-2272 Index checker server to optionally keep client connection open - removed from change log for release 1.12 as it is not included
Repository: nutch Updated Branches: refs/heads/master af6d8763f -> d29be63bd NUTCH-2272 Index checker server to optionally keep client connection open - removed from change log for release 1.12 as it is not included Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d29be63b Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d29be63b Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d29be63b Branch: refs/heads/master Commit: d29be63bd44cfcaf7e0a1e340160df8a0ba2b600 Parents: af6d876 Author: Sebastian NagelAuthored: Thu Jun 23 17:09:19 2016 +0200 Committer: Sebastian Nagel Committed: Thu Jun 23 17:09:19 2016 +0200 -- CHANGES.txt | 1 - 1 file changed, 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/d29be63b/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index 877f23b..ffcf5ae 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -37,7 +37,6 @@ Bug Improvement -[NUTCH-2272] - Index checker server to optionally keep client connection open [NUTCH-1233] - Rely on Tika for outlink extraction [NUTCH-1712] - Use MultipleInputs in Injector to make it a single mapreduce job [NUTCH-2172] - index-more: document format of contenttype-mapping.txt
[5/5] nutch git commit: fix unit test: CrawlDbFilter still writes reduce output dirs as part-00000 (not part-r-00000)
fix unit test: CrawlDbFilter still writes reduce output dirs as part-0 (not part-r-0) Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5e430e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5e430e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5e430e5 Branch: refs/heads/master Commit: f5e430e557cc3768261ab86617b1b1589e120d92 Parents: 756f2a1 Author: Sebastian Nagel Authored: Thu Feb 25 22:37:47 2016 +0100 Committer: Sebastian Nagel Committed: Thu Feb 25 22:37:47 2016 +0100 -- src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/f5e430e5/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java -- diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java index 5c38037..38c38ed 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java @@ -106,7 +106,7 @@ public class TestCrawlDbFilter { job.setOutputValueClass(CrawlDatum.class); JobClient.runJob(job); -Path fetchlist = new Path(new Path(newCrawlDb, "part-r-0"), "data"); +Path fetchlist = new Path(new Path(newCrawlDb, "part-0"), "data"); ArrayList l = readContents(fetchlist);
[1/5] nutch git commit: update tests to reflect change of reduce outputs by new API (part-nnnnn -> part-r-nnnnn): all unit tests pass now
Repository: nutch Updated Branches: refs/heads/master 25e879afc -> f5e430e55 update tests to reflect change of reduce outputs by new API (part-n -> part-r-n): all unit tests pass now Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0baca7a9 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0baca7a9 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0baca7a9 Branch: refs/heads/master Commit: 0baca7a966dd1031c80caa5e8e4a3e855c1f358e Parents: 288dcee Author: Sebastian NagelAuthored: Sun Jan 17 22:20:32 2016 +0100 Committer: Sebastian Nagel Committed: Thu Feb 25 21:26:30 2016 +0100 -- src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java | 2 +- src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java | 2 +- src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java | 2 +- src/test/org/apache/nutch/crawl/TestInjector.java | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java -- diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java index 86ba76c..56905e4 100644 --- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java +++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java @@ -60,7 +60,7 @@ public class CrawlDBTestUtil { Option wKeyOpt = MapFile.Writer.keyClass(Text.class); org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir, -"part-0"), wKeyOpt, wValueOpt); +"part-r-0"), wKeyOpt, wValueOpt); Iterator it = init.iterator(); while (it.hasNext()) { URLCrawlDatum row = it.next(); http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java -- diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java 
b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java index 38c38ed..5c38037 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java @@ -106,7 +106,7 @@ public class TestCrawlDbFilter { job.setOutputValueClass(CrawlDatum.class); JobClient.runJob(job); -Path fetchlist = new Path(new Path(newCrawlDb, "part-0"), "data"); +Path fetchlist = new Path(new Path(newCrawlDb, "part-r-0"), "data"); ArrayList l = readContents(fetchlist); http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java -- diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java index c800610..b670551 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java @@ -149,7 +149,7 @@ public class TestCrawlDbMerger { org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); MapFile.Writer writer = new MapFile.Writer(config, new Path(dir, -"part-0"), wKeyOpt, wValueOpt); +"part-r-0"), wKeyOpt, wValueOpt); Iterator it = init.iterator(); while (it.hasNext()) { String key = it.next(); http://git-wip-us.apache.org/repos/asf/nutch/blob/0baca7a9/src/test/org/apache/nutch/crawl/TestInjector.java -- diff --git a/src/test/org/apache/nutch/crawl/TestInjector.java b/src/test/org/apache/nutch/crawl/TestInjector.java index 135f392..7293cbb 100644 --- a/src/test/org/apache/nutch/crawl/TestInjector.java +++ b/src/test/org/apache/nutch/crawl/TestInjector.java @@ -141,7 +141,7 @@ public class TestInjector { private List readCrawldb() throws IOException { Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME -+ "/part-0/data"); ++ "/part-r-0/data"); System.out.println("reading:" + dbfile); Option rFile = SequenceFile.Reader.file(dbfile); @SuppressWarnings("resource") @@ -161,7 +161,7 @@ public class 
TestInjector { private HashMap readCrawldbRecords() throws IOException { Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME -+ "/part-0/data"); ++ "/part-r-0/data"); System.out.println("reading:" + dbfile); Option rFile = SequenceFile.Reader.file(dbfile);
[3/5] nutch git commit: NUTCH-1712 applied to current trunk; run first simple tests (inject + merge)
NUTCH-1712 applied to current trunk; run first simple tests (inject + merge) Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3c691eb2 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3c691eb2 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3c691eb2 Branch: refs/heads/master Commit: 3c691eb2823cb85c9ffe95e9212ce7ac0e564709 Parents: 25e879a Author: Sebastian NagelAuthored: Mon Oct 19 21:48:05 2015 +0200 Committer: Sebastian Nagel Committed: Thu Feb 25 21:26:30 2016 +0100 -- src/java/org/apache/nutch/crawl/CrawlDb.java | 19 + src/java/org/apache/nutch/crawl/Injector.java | 599 - 2 files changed, 360 insertions(+), 258 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/3c691eb2/src/java/org/apache/nutch/crawl/CrawlDb.java -- diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 053e8fb..1537cdc 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -28,8 +28,10 @@ import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.util.*; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; @@ -173,6 +175,23 @@ public class CrawlDb extends NutchTool implements Tool { LockUtil.removeLockFile(fs, lock); } + public static void install(Job job, Path crawlDb) throws IOException { +Configuration conf = job.getConfiguration(); +boolean preserveBackup = conf.getBoolean("db.preserve.backup", true); +FileSystem fs = FileSystem.get(conf); +Path old = new Path(crawlDb, "old"); +Path current = new Path(crawlDb, CURRENT_NAME); +Path tempCrawlDb = 
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat +.getOutputPath(job); +FSUtils.replace(fs, old, current, true); +FSUtils.replace(fs, current, tempCrawlDb, true); +Path lock = new Path(crawlDb, LOCK_NAME); +LockUtil.removeLockFile(fs, lock); +if (!preserveBackup && fs.exists(old)) { + fs.delete(old, true); +} + } + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args); System.exit(res); http://git-wip-us.apache.org/repos/asf/nutch/blob/3c691eb2/src/java/org/apache/nutch/crawl/Injector.java -- diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index dc1f1cf..0d01dc8 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -17,211 +17,267 @@ package org.apache.nutch.crawl; -import java.io.*; -import java.text.SimpleDateFormat; -import java.util.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.io.*; -import org.apache.hadoop.fs.*; -import org.apache.hadoop.conf.*; -import 
org.apache.hadoop.mapred.*; -import org.apache.hadoop.util.*; -import org.apache.nutch.net.*; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.TimingUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import
[4/5] nutch git commit: NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job, this closes #86
NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job, this closes #86 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/756f2a1c Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/756f2a1c Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/756f2a1c Branch: refs/heads/master Commit: 756f2a1c88d638f21515ec472088d8f504d12d44 Parents: 0baca7a Author: Sebastian Nagel <sna...@apache.org> Authored: Thu Feb 25 22:04:14 2016 +0100 Committer: Sebastian Nagel <sna...@apache.org> Committed: Thu Feb 25 22:24:45 2016 +0100 -- CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/756f2a1c/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index 71647ee..9b3895c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch Nutch Change Log +* NUTCH-1712 Use MultipleInputs in Injector to make it a single mapreduce job (tejasp, snagel) + * NUTCH-2231 Jexl support in generator job (markus) * NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus)
[2/5] nutch git commit: add unit tests based on MRUnit
add unit tests based on MRUnit Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/288dceed Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/288dceed Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/288dceed Branch: refs/heads/master Commit: 288dceedb7de28457878eecb03a571d082a48cc2 Parents: 3c691eb Author: Sebastian NagelAuthored: Sun Jan 17 21:32:31 2016 +0100 Committer: Sebastian Nagel Committed: Thu Feb 25 21:26:30 2016 +0100 -- ivy/ivy.xml | 10 +- ivy/ivysettings.xml | 2 +- src/java/org/apache/nutch/crawl/Injector.java | 7 +- .../nutch/crawl/CrawlDbUpdateTestDriver.java| 138 +++ .../apache/nutch/crawl/TestCrawlDbStates.java | 7 +- .../org/apache/nutch/crawl/TestInjector.java| 3 +- .../org/apache/nutch/fetcher/TestFetcher.java | 2 +- 7 files changed, 156 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivy.xml -- diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 206cce7..bc8d293 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -11,7 +11,7 @@ OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
--> - +http://ant.apache.org/ivy/maven;> http://www.apache.org/licenses/LICENSE-2.0.txt/; /> @@ -98,6 +98,10 @@ + + + + @@ -125,9 +129,7 @@ - - - + http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivysettings.xml -- diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 0319333..d9b5044 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -35,7 +35,7 @@ value="https://repository.apache.org/content/repositories/snapshots/; override="false"/> + value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/java/org/apache/nutch/crawl/Injector.java -- diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 0d01dc8..383aaf1 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -319,12 +319,13 @@ public class Injector extends NutchTool implements Tool { setConf(conf); } - public void inject(Path crawlDb, Path urlDir) throws Exception { + public void inject(Path crawlDb, Path urlDir) + throws IOException, ClassNotFoundException, InterruptedException { inject(crawlDb, urlDir, false, false); } public void inject(Path crawlDb, Path urlDir, boolean overwrite, - boolean update) throws Exception { + boolean update) throws IOException, ClassNotFoundException, InterruptedException { SimpleDateFormat sdf = new SimpleDateFormat("-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -397,7 +398,7 @@ public class Injector extends NutchTool implements Tool { LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } -} catch (Exception e) { +} catch (IOException e) { if (fs.exists(tempCrawlDb)) { fs.delete(tempCrawlDb, true); } http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java -- diff --git 
a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java new file mode 100644 index 000..7238f88 --- /dev/null +++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or
svn commit: r1726314 - in /nutch/trunk: CHANGES.txt conf/regex-normalize.xml.template ivy/ivy.xml
Author: snagel Date: Fri Jan 22 21:26:12 2016 New Revision: 1726314 URL: http://svn.apache.org/viewvc?rev=1726314=rev Log: NUTCH-2204 Remove junit lib from runtime Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/regex-normalize.xml.template nutch/trunk/ivy/ivy.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1726314=1726313=1726314=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 22 21:26:12 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2204 Remove junit lib from runtime (snagel) + * NUTCH-2201 Remove loops program from webgraph package (markus) * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp) Modified: nutch/trunk/conf/regex-normalize.xml.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1726314=1726313=1726314=diff == --- nutch/trunk/conf/regex-normalize.xml.template (original) +++ nutch/trunk/conf/regex-normalize.xml.template Fri Jan 22 21:26:12 2016 @@ -39,11 +39,12 @@ /$3 --> - + Modified: nutch/trunk/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1726314=1726313=1726314=diff == --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Fri Jan 22 21:26:12 2016 @@ -92,6 +92,7 @@ +
nutch git commit: Inconsistent log level
Repository: nutch Updated Branches: refs/heads/master 6d2bfa986 -> 0e03daf11 Inconsistent log level Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0e03daf1 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0e03daf1 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0e03daf1 Branch: refs/heads/master Commit: 0e03daf1139a1a8465d6e2a6b54490e2dfc2a9ef Parents: 6d2bfa9 Author: Sebastian Nagel <sna...@apache.org> Authored: Fri Apr 29 18:33:04 2016 +0200 Committer: Sebastian Nagel <sna...@apache.org> Committed: Fri Apr 29 18:33:04 2016 +0200 -- CHANGES.txt | 2 ++ src/java/org/apache/nutch/fetcher/FetcherThread.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/0e03daf1/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index 6173134..436db07 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch Nutch Change Log +* NUTCH-2256 Inconsistent log level (songwanging via snagel) + * NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 (Federico Bonelli, snagel) * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann) http://git-wip-us.apache.org/repos/asf/nutch/blob/0e03daf1/src/java/org/apache/nutch/fetcher/FetcherThread.java -- diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 09315a7..e57e735 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -286,7 +286,7 @@ public class FetcherThread extends Thread { .getFetchItemQueue(fit.queueID); fiq.crawlDelay = rules.getCrawlDelay(); if (LOG.isDebugEnabled()) { - LOG.info("Crawl delay for queue: " + fit.queueID + LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + 
fiq.crawlDelay + " as per robots.txt. url: " + fit.url); }
nutch git commit: NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 - generate base64 encoded string directly from content bytes (patch provided by Federico Bonelli) - add JUn
Repository: nutch Updated Branches: refs/heads/master 8572fd955 -> 6d2bfa986 NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 - generate base64 encoded string directly from content bytes (patch provided by Federico Bonelli) - add JUnit test to test indexing base64 encoded binary content with UTF-8, ISO-8859-1 and ISO-8859-2 character sets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6d2bfa98 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6d2bfa98 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6d2bfa98 Branch: refs/heads/master Commit: 6d2bfa98635d8055d56dbe2597efc953f420ed5a Parents: 8572fd9 Author: Sebastian Nagel <sna...@apache.org> Authored: Mon Apr 25 14:40:44 2016 +0200 Committer: Sebastian Nagel <sna...@apache.org> Committed: Wed Apr 27 22:49:47 2016 +0200 -- CHANGES.txt | 2 + .../apache/nutch/indexer/IndexerMapReduce.java | 12 +- .../apache/nutch/indexer/NutchIndexAction.java | 3 + .../nutch/indexer/TestIndexerMapReduce.java | 187 +++ 4 files changed, 198 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index e14d7c5..6173134 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch Nutch Change Log +* NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 (Federico Bonelli, snagel) + * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann) * NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen) http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/IndexerMapReduce.java -- diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 
1d5f66f..5025525 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -350,14 +350,14 @@ public class IndexerMapReduce extends Configured implements } if (content != null) { - // Get the original unencoded content - String binary = new String(content.getContent()); - - // optionally encode as base64 + // Add the original binary content + String binary; if (base64) { -binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary)); +// optionally encode as base64 +binary = Base64.encodeBase64String(content.getContent()); + } else { +binary = new String(content.getContent()); } - doc.add("binaryContent", binary); } http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/NutchIndexAction.java -- diff --git a/src/java/org/apache/nutch/indexer/NutchIndexAction.java b/src/java/org/apache/nutch/indexer/NutchIndexAction.java index 679d784..b2517c3 100644 --- a/src/java/org/apache/nutch/indexer/NutchIndexAction.java +++ b/src/java/org/apache/nutch/indexer/NutchIndexAction.java @@ -37,6 +37,9 @@ public class NutchIndexAction implements Writable { public NutchDocument doc = null; public byte action = ADD; + protected NutchIndexAction() { + } + public NutchIndexAction(NutchDocument doc, byte action) { this.doc = doc; this.action = action; http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java -- diff --git a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java new file mode 100644 index 000..d581a0f --- /dev/null +++ b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java @@ -0,0 +1,187 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writ
nutch git commit: Inconsistent log level
Repository: nutch Updated Branches: refs/heads/2.x 9e7c0e6fa -> 1fc254e5e Inconsistent log level Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/1fc254e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/1fc254e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/1fc254e5 Branch: refs/heads/2.x Commit: 1fc254e5eb68f40f66911ba9854d20c0fea88fc9 Parents: 9e7c0e6 Author: Sebastian Nagel <sna...@apache.org> Authored: Fri Apr 29 18:46:04 2016 +0200 Committer: Sebastian Nagel <sna...@apache.org> Committed: Fri Apr 29 18:46:04 2016 +0200 -- CHANGES.txt | 2 ++ src/java/org/apache/nutch/fetcher/FetcherReducer.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/1fc254e5/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index 0a20a98..b7f1345 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,8 @@ Nutch Change Log Nutch 2.4 Development + * NUTCH-2256 Inconsistent log level (songwanging via snagel) + * NUTCH-961 GitHub-92 Add the boilerpipe parsing adapted from NUTCH-961 (Jeremie Bourseaux <jeremie.bours...@xilopix.com> via mattmann) * GitHub-94 Fix the issue of the bad timestamp. 
(Jeremie Bourseaux <jeremie.bours...@xilopix.com> via mattmann) http://git-wip-us.apache.org/repos/asf/nutch/blob/1fc254e5/src/java/org/apache/nutch/fetcher/FetcherReducer.java -- diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java index 00860b6..8ee7477 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java +++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java @@ -522,7 +522,7 @@ public class FetcherReducer extends .getFetchItemQueue(fit.queueID); fiq.crawlDelay = rules.getCrawlDelay(); if (LOG.isDebugEnabled()) { - LOG.info("Crawl delay for queue: " + fit.queueID + LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url); }
nutch git commit: fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets
Repository: nutch Updated Branches: refs/heads/master 044e8e77e -> 8572fd955 fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/8572fd95 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/8572fd95 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/8572fd95 Branch: refs/heads/master Commit: 8572fd9551b430f31a4fdace14738f2d9959b370 Parents: 044e8e7 Author: Karanjeet SinghAuthored: Mon Apr 18 00:45:37 2016 -0700 Committer: Karanjeet Singh Committed: Mon Apr 18 00:45:37 2016 -0700 -- src/plugin/protocol-htmlunit/build.xml | 9 - .../nutch/protocol/htmlunit/HttpResponse.java | 408 ++- 2 files changed, 317 insertions(+), 100 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/build.xml -- diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml index bf695fe..899214c 100644 --- a/src/plugin/protocol-htmlunit/build.xml +++ b/src/plugin/protocol-htmlunit/build.xml @@ -34,13 +34,4 @@ - - - - - - - - - http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java -- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 7242f40..8b1a031 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -20,11 +20,18 @@ import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.io.PushbackInputStream; import java.net.InetSocketAddress; import 
java.net.Socket; import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; @@ -35,46 +42,78 @@ import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.http.api.HttpException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +/** + * An HTTP response. + */ public class HttpResponse implements Response { - private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); - - private Http http; + private Configuration conf; + private HttpBase http; private URL url; + private String orig; + private String base; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); + // used for storing the http headers verbatim + private StringBuffer httpHeaders; - /** The nutch configuration */ - private Configuration conf = null; + protected enum Scheme { +HTTP, HTTPS, + } - public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { + /** + * Default public constructor. 
+ * + * @param http + * @param url + * @param datum + * @throws ProtocolException + * @throws IOException + */ + public HttpResponse(HttpBase http, URL url, CrawlDatum datum) + throws ProtocolException, IOException { -this.conf = http.getConf(); this.http = http; this.url = url; +this.orig = url.toString(); +this.base = url.toString(); + +Scheme scheme = null; + +if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; +} else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; +} else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); +} + +if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("fetching " + url); +} -LOG.info("fetching {}", url); - String path = "".equals(url.getFile()) ? "/" : url.getFile(); // some servers will redirect a request with a host line like // "Host: :80" to "http:///"- they // don't want the :80... + String host = url.getHost(); int port; String portString; if (url.getPort() == -1) { - port = 80; + if (scheme == Scheme.HTTP) { +port = 80; + } else { +
[2/2] nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html - fix broken unit test (fix HTML markup, make test for meta data extraction obligatory) -
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html - fix broken unit test (fix HTML markup, make test for meta data extraction obligatory) - add all values of general metadata to parse metadata Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada Branch: refs/heads/master Commit: 34050adae0896a6d7ddb254a1622a03af6e07175 Parents: c18e19b Author: Sebastian NagelAuthored: Fri Jul 1 15:07:52 2016 +0200 Committer: Sebastian Nagel Committed: Fri Jul 1 15:10:49 2016 +0200 -- .../org/apache/nutch/metadata/Metadata.java | 25 .../org/apache/nutch/parse/html/HtmlParser.java | 4 +--- .../apache/nutch/parse/html/TestHtmlParser.java | 11 - 3 files changed, 31 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java -- diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index f0bfcd3..8a57ee3 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, } /** + * Add all name/value mappings (merge two metadata mappings). If a name + * already exists in current metadata the values are added to existing values. 
+ * + * @param metadata + * other Metadata to be merged + */ + public void addAll(Metadata metadata) { +for (String name : metadata.names()) { + String[] addValues = metadata.getValues(name); + if (addValues == null) +continue; + String[] oldValues = this.metadata.get(name); + if (oldValues == null) { +this.metadata.put(name, addValues); + } else { +String[] newValues = new String[oldValues.length + addValues.length]; +System.arraycopy(oldValues, 0, newValues, 0, oldValues.length); +System.arraycopy(addValues, 0, newValues, oldValues.length, +addValues.length); +this.metadata.put(name, newValues); + } +} + } + + /** * Copy All key-value pairs from properties. * * @param properties http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java -- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index baa..4d043ba 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -183,9 +183,7 @@ public class HtmlParser implements Parser { HTMLMetaProcessor.getMetaTags(metaTags, root, base); // populate Nutch metadata with HTML meta directives -for (String name : metaTags.getGeneralTags().names()) { - metadata.add(name, metaTags.getGeneralTags().get(name)); -} +metadata.addAll(metaTags.getGeneralTags()); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java -- diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java index bcfe9e4..7099f50 100644 --- 
a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java @@ -40,8 +40,8 @@ public class TestHtmlParser { private static final String encodingTestBody = "\n français\n español\n русский язык\n čeština\n ελληνικά\n"; private static final String encodingTestContent = "" + encodingTestKeywords + "\n" - + "\n" + "\n" + encodingTestBody + "\n"; + + "\n" + + "\n" + encodingTestBody + "\n"; private static String[][] encodingTestPages = { { @@ -113,10 +113,9 @@ public class TestHtmlParser { Assert.assertTrue(keyword + " not found in text (" + name + ")", text.contains(keyword)); } - if (keywords != null) { -Assert.assertEquals("Keywords not
[1/2] nutch git commit: NUTCH-2291 - Fix mrunit dependencies - remove classifier from dependency because pom file name on Maven repository does not contain a classifier
Repository: nutch Updated Branches: refs/heads/master cb6fbae51 -> 34050adae NUTCH-2291 - Fix mrunit dependencies - remove classifier from dependency because pom file name on Maven repository does not contain a classifier Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/c18e19bf Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/c18e19bf Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/c18e19bf Branch: refs/heads/master Commit: c18e19bfe63c3ac5221d1a0f454b9e1a037a4386 Parents: cb6fbae Author: Sebastian Nagel Authored: Fri Jul 1 14:45:41 2016 +0200 Committer: Sebastian Nagel Committed: Fri Jul 1 14:45:41 2016 +0200 -- ivy/ivy.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/c18e19bf/ivy/ivy.xml -- diff --git a/ivy/ivy.xml b/ivy/ivy.xml index a4e9481..a9a83ae 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -97,7 +97,7 @@ - +
[3/4] nutch git commit: CrawlDb statistics: add fetch time (earliest, latest, average)
CrawlDb statistics: add fetch time (earliest, latest, average) Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ea2843b9 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ea2843b9 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ea2843b9 Branch: refs/heads/master Commit: ea2843b9be6569e17963031d7370f5db42261809 Parents: 6b141fb Author: Sebastian NagelAuthored: Mon Jun 20 14:42:04 2016 +0200 Committer: Sebastian Nagel Committed: Sat Jul 2 12:06:04 2016 +0200 -- .../org/apache/nutch/crawl/CrawlDbReader.java | 76 1 file changed, 46 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/ea2843b9/src/java/org/apache/nutch/crawl/CrawlDbReader.java -- diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 8f42ac4..381cec5 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -197,6 +197,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); output.collect(new Text("s"), new LongWritable( (long) (value.getScore() * 1000.0))); + output.collect(new Text("f"), new LongWritable(value.getFetchTime())); if (sort) { URL u = new URL(key.toString()); String host = u.getHost(); @@ -219,32 +220,40 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { public void close() { } +private void reduceMinMaxTotal(String keyPrefix, Iterator values, +OutputCollector output, Reporter reporter) +throws IOException { + long total = 0; + long min = Long.MAX_VALUE; + long max = Long.MIN_VALUE; + while (values.hasNext()) { +LongWritable cnt = values.next(); +if (cnt.get() < min) + min = cnt.get(); +if (cnt.get() > max) + max = cnt.get(); +total += cnt.get(); + } + output.collect(new Text(keyPrefix+"n"), new LongWritable(min)); + 
output.collect(new Text(keyPrefix+"x"), new LongWritable(max)); + output.collect(new Text(keyPrefix+"t"), new LongWritable(total)); +} + public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { val.set(0L); String k = key.toString(); - if (!k.equals("s")) { + if (k.equals("s")) { +reduceMinMaxTotal("sc", values, output, reporter); + } else if (k.equals("f")) { +reduceMinMaxTotal("ft", values, output, reporter); + } else { while (values.hasNext()) { LongWritable cnt = values.next(); val.set(val.get() + cnt.get()); } output.collect(key, val); - } else { -long total = 0; -long min = Long.MAX_VALUE; -long max = Long.MIN_VALUE; -while (values.hasNext()) { - LongWritable cnt = values.next(); - if (cnt.get() < min) -min = cnt.get(); - if (cnt.get() > max) -max = cnt.get(); - total += cnt.get(); -} -output.collect(new Text("scn"), new LongWritable(min)); -output.collect(new Text("scx"), new LongWritable(max)); -output.collect(new Text("sct"), new LongWritable(total)); } } } @@ -277,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(cnt.get() + val.get()); } output.collect(key, cnt); - } else if (k.equals("scx")) { + } else if (k.equals("scx") || k.equals("ftx")) { LongWritable cnt = new LongWritable(Long.MIN_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -285,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("scn")) { + } else if (k.equals("scn") || k.equals("ftn")) { LongWritable cnt = new LongWritable(Long.MAX_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -293,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("sct")) { + } else if (k.equals("sct") || k.equals("ftt")) { LongWritable cnt = new LongWritable(); while 
(values.hasNext()) {
[2/4] nutch git commit: CrawlDb statistics: add fetch interval (shortest, longest, average)
CrawlDb statistics: add fetch interval (shortest, longest, average) Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/39f6c713 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/39f6c713 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/39f6c713 Branch: refs/heads/master Commit: 39f6c713974240d19d54a515cd04372878739456 Parents: ea2843b Author: Sebastian NagelAuthored: Wed Jun 22 16:22:33 2016 +0200 Committer: Sebastian Nagel Committed: Sat Jul 2 12:06:04 2016 +0200 -- .../org/apache/nutch/crawl/CrawlDbReader.java | 35 - src/java/org/apache/nutch/util/TimingUtil.java | 53 2 files changed, 55 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/crawl/CrawlDbReader.java -- diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 381cec5..3cf6ff3 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -69,6 +69,7 @@ import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.TimingUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.apache.commons.lang.time.DateUtils; @@ -195,9 +196,10 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { output.collect(new Text("status " + value.getStatus()), COUNT_1); output .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); - output.collect(new Text("s"), new LongWritable( + output.collect(new Text("sc"), new LongWritable( (long) (value.getScore() * 1000.0))); - output.collect(new Text("f"), new LongWritable(value.getFetchTime())); + output.collect(new Text("ft"), new LongWritable(value.getFetchTime())); + output.collect(new 
Text("fi"), new LongWritable(value.getFetchInterval())); if (sort) { URL u = new URL(key.toString()); String host = u.getHost(); @@ -244,10 +246,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { throws IOException { val.set(0L); String k = key.toString(); - if (k.equals("s")) { -reduceMinMaxTotal("sc", values, output, reporter); - } else if (k.equals("f")) { -reduceMinMaxTotal("ft", values, output, reporter); + if (k.equals("sc") || k.equals("ft") || k.equals("fi")) { +reduceMinMaxTotal(k, values, output, reporter); } else { while (values.hasNext()) { LongWritable cnt = values.next(); @@ -286,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(cnt.get() + val.get()); } output.collect(key, cnt); - } else if (k.equals("scx") || k.equals("ftx")) { + } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) { LongWritable cnt = new LongWritable(Long.MIN_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -294,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("scn") || k.equals("ftn")) { + } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) { LongWritable cnt = new LongWritable(Long.MAX_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -302,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("sct") || k.equals("ftt")) { + } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) { LongWritable cnt = new LongWritable(); while (values.hasNext()) { LongWritable val = values.next(); @@ -402,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { LongWritable val = stats.get(k); if (val == null) { val = new LongWritable(); - if (k.equals("scx") || k.equals("ftx")) + if (k.equals("scx") || 
k.equals("ftx") || k.equals("fix")) val.set(Long.MIN_VALUE); - if (k.equals("scn") ||
[1/2] nutch git commit: Remove obsolete properties protocol.plugin.check.blocking and protocol.plugin.check.robots
Repository: nutch Updated Branches: refs/heads/master d27c351f4 -> d37b7ce13 Remove obsolete properties protocol.plugin.check.blocking and protocol.plugin.check.robots Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/070a637b Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/070a637b Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/070a637b Branch: refs/heads/master Commit: 070a637babedc324948c0c58b333668bab6b813d Parents: d27c351 Author: Sebastian NagelAuthored: Mon Aug 15 11:19:46 2016 +0200 Committer: Sebastian Nagel Committed: Mon Aug 15 11:19:46 2016 +0200 -- src/java/org/apache/nutch/fetcher/Fetcher.java | 4 src/java/org/apache/nutch/protocol/Protocol.java | 18 -- 2 files changed, 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/070a637b/src/java/org/apache/nutch/fetcher/Fetcher.java -- diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index aad9ee9..e60b10f 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -209,10 +209,6 @@ MapRunnable { feeder.setTimeLimit(timelimit); feeder.start(); -// set non-blocking & no-robots mode for HTTP protocol plugins. 
-getConf().setBoolean(Protocol.CHECK_BLOCKING, false); -getConf().setBoolean(Protocol.CHECK_ROBOTS, false); - for (int i = 0; i < threadCount; i++) { // spawn threads FetcherThread t = new FetcherThread(getConf(), getActiveThreads(), fetchQueues, feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName, http://git-wip-us.apache.org/repos/asf/nutch/blob/070a637b/src/java/org/apache/nutch/protocol/Protocol.java -- diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index 0aa5d29..efd0100 100755 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -33,24 +33,6 @@ public interface Protocol extends Pluggable, Configurable { public final static String X_POINT_ID = Protocol.class.getName(); /** - * Property name. If in the current configuration this property is set to - * true, protocol implementations should handle "politeness" limits - * internally. If this is set to false, it is assumed that these limits are - * enforced elsewhere, and protocol implementations should not enforce them - * internally. - */ - public final static String CHECK_BLOCKING = "protocol.plugin.check.blocking"; - - /** - * Property name. If in the current configuration this property is set to - * true, protocol implementations should handle robot exclusion rules - * internally. If this is set to false, it is assumed that these limits are - * enforced elsewhere, and protocol implementations should not enforce them - * internally. - */ - public final static String CHECK_ROBOTS = "protocol.plugin.check.robots"; - - /** * Returns the {@link Content} for a fetchlist entry. */ ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
[2/2] nutch git commit: Merge branch 'NUTCH-2299' of https://github.com/sebastian-nagel/nutch this closes #140 - Remove obsolete properties protocol.plugin.check.*
Merge branch 'NUTCH-2299' of https://github.com/sebastian-nagel/nutch this closes #140 - Remove obsolete properties protocol.plugin.check.* Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d37b7ce1 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d37b7ce1 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d37b7ce1 Branch: refs/heads/master Commit: d37b7ce13ee82f0a7d1388f87c2be5d636e425aa Parents: d27c351 070a637 Author: Sebastian Nagel Authored: Tue Aug 16 20:43:01 2016 +0200 Committer: Sebastian Nagel Committed: Tue Aug 16 20:43:01 2016 +0200 -- src/java/org/apache/nutch/fetcher/Fetcher.java | 4 src/java/org/apache/nutch/protocol/Protocol.java | 18 -- 2 files changed, 22 deletions(-) --
nutch git commit: NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority - check whether URL.getAuthority() returns null - recompose URLs without authority with empty authority/host
Repository: nutch Updated Branches: refs/heads/2.x 022ed5c03 -> 700857d16 NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority - check whether URL.getAuthority() returns null - recompose URLs without authority with empty authority/host Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/700857d1 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/700857d1 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/700857d1 Branch: refs/heads/2.x Commit: 700857d16c9e1517ddb9868ed41171d91e5c9116 Parents: 022ed5c Author: Sebastian NagelAuthored: Wed Feb 1 11:51:04 2017 +0100 Committer: Sebastian Nagel Committed: Wed Feb 1 11:51:04 2017 +0100 -- .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 5 - .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 6 ++ 2 files changed, 10 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java -- diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index e17b19a..15a1de0 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -79,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) { - if (host != null) { + if (host != null && url.getAuthority() != null) { String newHost = host.toLowerCase(Locale.ROOT); // lowercase host if (!host.equals(newHost)) { host = newHost; @@ -89,6 +89,9 @@ public class BasicURLNormalizer extends Configured 
implements URLNormalizer { // etc.) which will likely cause a change if left away changed = true; } + } else { +// no host or authority: recompose the URL from components +changed = true; } if (port == url.getDefaultPort()) { // uses default port http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java -- diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 006c1a3..1d5d99e 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -100,6 +100,12 @@ public class TestBasicURLNormalizer { "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa?referer=http://bar.com", "http://foo.com/aa?referer=http://bar.com"); +// check for NPEs when normalizing URLs without host (authority) +normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt"); +normalizeTest("ftp:/", "ftp:/"); +normalizeTest("http:", "http:/"); +normalizeTest("http:;, "http:/"); +normalizeTest("http:///;, "http:/"); } private void normalizeTest(String weird, String normal) throws Exception {