svn commit: r894716 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml
Author: jnioche Date: Wed Dec 30 21:34:28 2009 New Revision: 894716 URL: http://svn.apache.org/viewvc?rev=894716&view=rev Log: Adding J. Nioche to the list of committers Modified: lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/credits.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?rev=894716&r1=894715&r2=894716&view=diff == --- lucene/nutch/trunk/site/credits.html (original) +++ lucene/nutch/trunk/site/credits.html Wed Dec 30 21:34:28 2009 @@ -252,6 +252,10 @@ +http://www.digitalpebble.com/";>Julien Nioche + + + http://people.apache.org/~siren";>Sami Siren @@ -261,7 +265,7 @@ - + Friends @@ -292,7 +296,7 @@ - + Sponsors Modified: lucene/nutch/trunk/site/credits.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?rev=894716&r1=894715&r2=894716&view=diff == --- lucene/nutch/trunk/site/credits.pdf (original) +++ lucene/nutch/trunk/site/credits.pdf Wed Dec 30 21:34:28 2009 @@ -58,10 +58,10 @@ >> endobj 14 0 obj -<< /Length 2451 /Filter [ /ASCII85Decode /FlateDecode ] +<< /Length 2523 /Filter [ /ASCII85Decode /FlateDecode ] >> stream -Gat=-?#uMo'RekG6)QN,j<4X4IDDVKe8Cd99l3ZjVX=Q[b.BLBhuXiFrUmi*,VH<1...@__ei(`FER#:35&J-3KkLmIc0$E/-9at+C5'JL_g:M-`ZXIRr>cMV(?8=o8Z43rXNS`.lKsK^`(anX=FV;m0$Sh;[&*WTGKJTabq4PB&nG%HT1<]HqfD`^fK]+DM"GC][A;AauDhV\=BF-6%+--+,-R<^Q`J-rq1^/aI!E7A77`*g/j,2T[+;*_3p]F4O:C]]NLjH]*W>a3p$E[tVeq&1,Q9`X>&AoU`^21U3M3(8Y,Y"8+d:rLd/?E>rprgkLDtPtOH9;iBRt,/sV1!"^kLNh#W?Xc;Z=^0N*TO_9#QGEfj)K%-/N_]VP#g)tHj?tS>b1CVc1X8#u7>Ig^0>aenmP)fY)!'l;.LN!;sR5V8_qK>q?BssHf[1`%[p=)^/[j['-n/*i...@j8c=+o1v+oasfe;<&/X[k+:G!Fa`7,=6%UB(je=+0Jk]pJuk0$Ful&w0jqci.wtfp?...@jip'*(4IN(nFWUC'4$+ASc0$'Hb0).HOX.nYPOI'ZfjA4lGoAq.OB7pr*@;,&dS]W^Y".".TVojXN,X0'^8csme]rov...@mrm=&7Ume#n129mnp^<[\n!"AiTa(oU;!=fZ7KED][iPuL2sG4B2QRR;R8h]pal\kC5TJ`i!B1dgG;M/ZcccB^2R[RqCf=sJoXH_\a#TIr0[=g'YE<]#JSS-7VY96Wr.ok]i=r\uB18i+V7Ss'SGHVU[Qb5/Q5TWMK3h(]eqes3?1^...@ad(So.oH1',$NU^Vd0*s7V-7,/:Z[3MHCgmf...@aa$iphpn0^:5L1ee7)3\(au1D0)OB\]4nLeh+trE:]>o[ep'HMp0S"6f*#u`m_=1)$r+/?0fr%8)ZF4M9-8$qiO8pl&$RTRX&;'t5i!`R=2hX%*)iE^Beb>'hN'B_]>]Q9K$mCB;OKPjEB#qBR8FHam6Gc+o6UEhb-kk2j:)X;T.C08B"Mf2l]$XQn0D%X<)b'213CgRMXW+^tAD'KI>J5,Sk/mc7!*...@$xk*%*p=z.(>r-Vcj#&EO>rcOd"WY'T"@sLig2Y'taK-,a-qM[rFHonKAtmbo1ejl28\D7UmR'edeKrfRJNN6G'FB5XBB=$3rGK9B(0DX,]nr`d...@`-q)"pJ?'%5m-cho3$21...@l!4?0y>ck3^lHrB4+UOCPg#tMPmHhVcMDK0f<-5e+uxw]j#...@qs-%sbew.jf15ga\^<`,1S+C=C-df;1RW])_L(N8X,+O2I;PjoBEdq"j[4Bb`o8s'nn'=NZ0ZN-f]T05Hk/tG(2[H&)qDr>NDb0!qqU^G<'Zp4:7+dW)^0sm#,\V;7V0cjr1p)f%/qYmj-Z?$AUlBL1%XlNB'tbd)?+Q^H,QmfH2%'N%_oPR?e$KQgk$dJLFe)cW2K6.Q%P8Zo1k"RAhX>59n?j...@mez0l[inp$=f$knplk[)_>J=psS7?oce?FP[7&Pe_$&g8ck",p<+RDZPcD%]pJJI>,k...@j3h?p5n:o%&3krIkL#pRsD#!?h]F0FGL\q2oJ"5:7u3j.f.!$/5=L,M8k5bmiXYO9t>D%($\s6U-#QdabpH>@gbM8BONu7"[aH:N\^fLQb6,C?YISJ#LRlDar#mfYU4Fk*_:IRVLV?eD)?r#-0;hW[7M/pHR5ZLdq~> +Gat=.gN&fR&:NH>+I/skrF!LtiR*u_M./K*UroO.4S$1Q#_\,o.Cdrh[?:`3(CDk$A_c#$'6^EnKoo'=DV"P[cj(1`Z!/GLqMlI'_P>Z=j'(/nk...@fmq3\jo#r1]&S]l-i",?Ph-``Gi-s;WK:i$e4Z``]21SYbbc$nE5rp]<=[@(H\rE*[q"&O7#Ynt4#%'?4'qE"87L)qbI0Jpm,!pItf#<5'$l$ec7b_j&,l[#2co1DBg(Z_cUknM!=eoou>&HLfcsp#HaX$g%+;k.b)A5`W!&ateTp>Ht/+0leNc/VuN[:mh^84EM8?!]/Z6e'(ch"95enOV7h.L'?p:(esRj9'XYQ4`BF1r1#H<(tG6F0S8$DNcU*5f)&Y$ fZ/@DdNgr>_D,g...@iup?phkyk5$kx!m0_fthr=hu;NN&]i8ld]!...@w6z>K7VWl)3...@j#c-hr;/KM[rdlSS&mT)'0cICU8Vcj.Z34NK=H_.m[?V#TJAdWN5eO'^tCpoT[8uf:>*8=Tp(GA]lQV]eR7Fi5"8'j3QEXYqBXrAWRK:p%nl-MR`5[BNc071,lDD.#t<+lT3s$D,)q44Do57+bTjq6RhRmfKMfp3f5<9e_iL;B<3U2$i=aO@@F0R_HkVG4]HK'Pf&WA`di`57QZI+)4...@n,#0#!]0&'oZ\sqoj?(/4NSg6MNC%ic)NLOh5$fQ9_Xf3Q?D37nT]ukQZ'fEd;8bYhZFL6Cc=Ac:6:'.S6kGu^5[tF9\IIri=cO7?r=mMb49P+#BZ^mrkc9B*B0)oZnYHkiG*P=[mO]lN+&&NISUnlgCh8l*/ZRCj`TCTDfi]S4Q8JEbKVB&%n+U$9p:*q*"aCnLBmVaka&#DC74u0o_;2rV,BoU0_m-Fu`+QVuUtbfUk/`K::_m[*:=LC=8j<%_%TZs4ecoCc1(kBo^O[B#n[DE"U&@a1="7#s5doSj<5,Ed*.i:YC+U#;?2=t2\ql-tZFnM?9&fQ?P3\jDgL*-R(N)X<3B.2?.Y%$:Z&<9p...@a3)OGT/vmtdajse...@2/%Fjf(DDn14:Vq6WNIQ4hnfCH9''p!^_aPj*4Jgo3uq6:9B7U*W;or
svn commit: r895972 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/ProtocolSt
Author: jnioche Date: Tue Jan 5 10:14:49 2010 New Revision: 895972 URL: http://svn.apache.org/viewvc?rev=895972&view=rev Log: NUTCH-658 : Add Counter for # of doc fetched in Reporter Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972&r1=895971&r2=895972&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jan 5 10:14:49 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-658 Use counters to report fetching and parsing status (jnioche) + * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) * NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972&r1=895971&r2=895972&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan 5 10:14:49 2010 @@ -607,6 +607,7 @@ LOG.debug("Denied by robots.txt: " + fit.url); } output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); +reporter.incrCounter("FetcherStatus", "robots_denied", 1); continue; } if (rules.getCrawlDelay() > 0) { @@ -615,6 +616,7 @@ fetchQueues.finishFetchItem(fit, true); LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping"); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); + reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1); continue; } else { FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); @@ -630,6 +632,8 @@ String urlString = fit.url.toString(); + reporter.incrCounter("FetcherStatus", status.getName(), 1); + switch(status.getCode()) { case ProtocolStatus.WOULDBLOCK: @@ -664,6 +668,7 @@ } else { // stop redirecting redirecting = false; + reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); } } } @@ -701,6 +706,7 @@ } else { // stop redirecting redirecting = false; +reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); } } else { // stop redirecting @@ -926,6 +932,7 @@ if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { + reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972&r1=895971&r2=895972&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 5 10:14:49 2010 @@ -93,6 +93,8 @@ Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); + reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); + if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); parse = parseStatus.getEmptyParse(getConf()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: jnioche Date: Wed Jan 6 17:01:51 2010 New Revision: 896539 URL: http://svn.apache.org/viewvc?rev=896539&view=rev Log: NUTCH-655 : Injecting Crawl metadata (jnioche) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539&r1=896538&r2=896539&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Jan 6 17:01:51 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-655 Injecting Crawl metadata (jnioche) + * NUTCH-658 Use counters to report fetching and parsing status (jnioche) * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539&r1=896538&r2=896539&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan 6 17:01:51 2010 @@ -37,10 +37,21 @@ import org.apache.nutch.util.NutchJob; /** This class takes a flat file of URLs and adds them to the of pages to be - * crawled. Useful for bootstrapping the system. */ + * crawled. Useful for bootstrapping the system. + * The URL files contain one URL per line, optionally followed by custom metadata + * separated by tabs with the metadata key separated from the corresponding value by '='. + * Note that some metadata keys are reserved : + * - nutch.score : allows to set a custom score for a specific URL + * - nutch.fetchInterval : allows to set a custom fetch interval for a specific URL + * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source + **/ public class Injector extends Configured implements Tool { public static final Log LOG = LogFactory.getLog(Injector.class); - + + /** metadata key reserved for setting a custom score for a specific URL */ + public static String nutchScoreMDName = "nutch.score"; + /** metadata key reserved for setting a custom fetchInterval for a specific URL */ + public static String nutchFetchIntervalMDName = "nutch.fetchInterval"; /** Normalize and filter injected urls. */ public static class InjectMapper implements Mapper { @@ -68,6 +79,36 @@ OutputCollector output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text + // if tabs : metadata that could be stored + // must be name=value and separated by \t + float customScore = -1f; + int customInterval = interval; + Map metadata = new TreeMap(); + if (url.indexOf("\t")!=-1){ + String[] splits = url.split("\t"); + url = splits[0]; + for (int s=1;s keysIter = metadata.keySet().iterator(); +while (keysIter.hasNext()){ + String keymd = keysIter.next(); + String valuemd = metadata.get(keymd); + datum.getMetaData().put(new Text(keymd), new Text(valuemd)); } output.collect(value, datum); }
svn commit: r896545 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jnioche Date: Wed Jan 6 17:08:17 2010 New Revision: 896545 URL: http://svn.apache.org/viewvc?rev=896545&view=rev Log: NUTCH-658 : small fix + renamed status value Exception into AboveExceptionThresholdInQueue Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=896545&r1=896544&r2=896545&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jan 6 17:08:17 2010 @@ -717,7 +717,8 @@ case ProtocolStatus.EXCEPTION: logError(fit.url, status.getMessage()); int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID()); -reporter.incrCounter("FetcherStatus", "Exceptions", killedURLs); +if (killedURLs!=0) + reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry case ProtocolStatus.BLOCKED:
svn commit: r897180 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: jnioche Date: Fri Jan 8 12:01:46 2010 New Revision: 897180 URL: http://svn.apache.org/viewvc?rev=897180&view=rev Log: NUTCH-269 : OOME because no upper-bound on inlinks count (stack + jnioche) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=897180&r1=897179&r2=897180&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Jan 8 12:01:46 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) + * NUTCH-655 Injecting Crawl metadata (jnioche) * NUTCH-658 Use counters to report fetching and parsing status (jnioche) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=897180&r1=897179&r2=897180&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan 8 12:01:46 2010 @@ -384,6 +384,14 @@ + db.update.max.inlinks + 1 + Maximum number of inlinks to take into account when updating + a URL score in the crawlDB. Only the best scoring inlinks are kept. + + + + db.ignore.internal.links true If true, when adding new links to a page, links from Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=897180&r1=897179&r2=897180&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Jan 8 12:01:46 2010 @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import java.io.IOException; // Commons Logging imports @@ -27,6 +28,7 @@ import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.PriorityQueue; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; @@ -37,7 +39,7 @@ private int retryMax; private CrawlDatum result = new CrawlDatum(); - private ArrayList linked = new ArrayList(); + private InlinkPriorityQueue linked = null; private ScoringFilters scfilters = null; private boolean additionsAllowed; private int maxInterval; @@ -51,6 +53,8 @@ maxInterval = job.getInt("db.fetch.interval.max", 0 ); if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; schedule = FetchScheduleFactory.getFetchSchedule(job); +int maxLinks = job.getInt("db.update.max.inlinks", 1); +linked = new InlinkPriorityQueue(maxLinks); } public void close() {} @@ -111,7 +115,7 @@ } else { link = datum; } -linked.add(link); +linked.insert(link); break; case CrawlDatum.STATUS_SIGNATURE: signature = datum.getSignature(); @@ -120,13 +124,21 @@ LOG.warn("Unknown status, key: " + key + ", datum: " + datum); } } - + +// copy the content of the queue into a List +// in reversed order +int numLinks = linked.size(); +List linkList = new ArrayList(numLinks); +for (int i = numLinks - 1; i >= 0; i--) { + linkList.add(linked.pop()); +} + // if it doesn't already exist, skip it if (!oldSet && !additionsAllowed) return; // if there is no fetched datum, perhaps there is a link -if (!fetchSet && linked.size() > 0) { - fetch = linked.get(0); +if (!fetchSet && linkList.size() > 0) { + fetch = linkList.get(0); fetchSet = true; } @@ -260,7 +272,7 @@ } try { - scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linked); + scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linkList); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't update score, key=" + key + ": " + e); @@ -270,5 +282,20 @@ result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); output.collect(key, result); } + +} +class InlinkPriorityQueue extends PriorityQueue { + + public InlinkPriorityQueue(int maxSize) { +initialize(maxSize); + } + + /** Determines t
svn commit: r897825 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/MimeUtil.java test/org/apache/nutch/protocol/TestContent.java
Author: jnioche Date: Mon Jan 11 10:13:21 2010 New Revision: 897825 URL: http://svn.apache.org/viewvc?rev=897825&view=rev Log: fix for NUTCH-767 : reverted original expected values for test + treat text/plain as a default mime-type from Tika Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=897825&r1=897824&r2=897825&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Mon Jan 11 10:13:21 2010 @@ -159,6 +159,7 @@ if (this.mimeMagic) { MimeType magicType = this.mimeTypes.getMimeType(data); if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM) + && !magicType.getName().equals(MimeTypes.PLAIN_TEXT) && type != null && !type.getName().equals(magicType.getName())) { // If magic enabled and the current mime type differs from that of the // one returned from the magic, take the magic mimeType Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=897825&r1=897824&r2=897825&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Jan 11 10:13:21 2010 @@ -63,28 +63,19 @@ "http://www.foo.com/";, "".getBytes("UTF8"), "text/html; charset=UTF-8", p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals("text/plain", c.getContentType()); +assertEquals("text/html", c.getContentType()); c = new Content("http://www.foo.com/foo.html";, "http://www.foo.com/";, "".getBytes("UTF8"), "", p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals("text/plain", c.getContentType()); +assertEquals("text/html", c.getContentType()); c = new Content("http://www.foo.com/foo.html";, "http://www.foo.com/";, "".getBytes("UTF8"), null, p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals("text/plain", c.getContentType()); +assertEquals("text/html", c.getContentType()); c = new Content("http://www.foo.com/";, "http://www.foo.com/";, @@ -108,10 +99,7 @@ "http://www.foo.com/";, "".getBytes("UTF8"), "", p, conf); -// TODO check that Tika returns the right value and -// revert to the default type -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals("text/plain", c.getContentType()); +assertEquals(MimeTypes.OCTET_STREAM, c.getContentType()); c = new Content("http://www.foo.com/";, "http://www.foo.com/";,
svn commit: r905228 - in /lucene/nutch/trunk/lib: tika-core-0.5.jar tika-core-0.6.jar
Author: jnioche Date: Mon Feb 1 09:59:50 2010 New Revision: 905228 URL: http://svn.apache.org/viewvc?rev=905228&view=rev Log: NUTCH-781: upgrade tika to version 0.6 Added: lucene/nutch/trunk/lib/tika-core-0.6.jar (with props) Removed: lucene/nutch/trunk/lib/tika-core-0.5.jar Added: lucene/nutch/trunk/lib/tika-core-0.6.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.6.jar?rev=905228&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/tika-core-0.6.jar -- svn:mime-type = application/octet-stream
svn commit: r905229 - /lucene/nutch/trunk/CHANGES.txt
Author: jnioche Date: Mon Feb 1 10:03:07 2010 New Revision: 905229 URL: http://svn.apache.org/viewvc?rev=905229&view=rev Log: NUTCH-781: upgrade tika to version 0.6 Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905229&r1=905228&r2=905229&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Feb 1 10:03:07 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-781 Update Tika to v0.6 (jnioche) + * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) * NUTCH-655 Injecting Crawl metadata (jnioche)
svn commit: r905550 [1/2] - /lucene/nutch/trunk/conf/tika-mimetypes.xml
Author: jnioche Date: Tue Feb 2 09:31:19 2010 New Revision: 905550 URL: http://svn.apache.org/viewvc?rev=905550&view=rev Log: NUTCH-781 : updated tika-mimetypes.xml Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
svn commit: r906907 - in /lucene/nutch/trunk: CHANGES.txt conf/domain-suffixes.xml
Author: jnioche Date: Fri Feb 5 11:52:57 2010 New Revision: 906907 URL: http://svn.apache.org/viewvc?rev=906907&view=rev Log: NUTCH-786 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/domain-suffixes.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=906907&r1=906906&r2=906907&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 5 11:52:57 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-786 Improvement to the list of suffix domains (jnioche) + * NUTCH-775 Enhance searcher interface (siren) * NUTCH-781 Update Tika to v0.6 (jnioche) Modified: lucene/nutch/trunk/conf/domain-suffixes.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-suffixes.xml?rev=906907&r1=906906&r2=906907&view=diff == --- lucene/nutch/trunk/conf/domain-suffixes.xml (original) +++ lucene/nutch/trunk/conf/domain-suffixes.xml Fri Feb 5 11:52:57 2010 @@ -1744,6 +1744,16 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
svn commit: r910187 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
Author: jnioche Date: Mon Feb 15 09:41:05 2010 New Revision: 910187 URL: http://svn.apache.org/viewvc?rev=910187&view=rev Log: NUTCH-766: small improvement to Tika parser : prioritise default Tika parser when discovering plugins matching mime-type Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=910187&r1=910186&r2=910187&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Mon Feb 15 09:41:05 2010 @@ -343,11 +343,14 @@ // NotMappedParserException for (int i=0; i 0) {
svn commit: r910454 - in /lucene/nutch/trunk/src/plugin/languageidentifier/src: java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.j
Author: jnioche Date: Tue Feb 16 10:20:22 2010 New Revision: 910454 URL: http://svn.apache.org/viewvc?rev=910454&view=rev Log: NUTCH-794 : Language Identification must use check the parse metadata for language values Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Tue Feb 16 10:20:22 2010 @@ -91,15 +91,33 @@ Parse parse = parseResult.get(content.getUrl()); +String lang = getLanguageFromMetadata(parse.getData().getParseMeta()); +if (lang != null) { + parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); + return parseResult; +} + // Trying to find the document's language LanguageParser parser = new LanguageParser(doc); -String lang = parser.getLanguage(); +lang = parser.getLanguage(); if (lang != null) { parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); } return parseResult; } + + // Check in the metadata whether the language has already been stored there by Tika + private static String getLanguageFromMetadata(Metadata parseMD){ +// dublin core +String lang = parseMD.get("dc.language"); +if (lang!=null) return lang; +// meta content-language +lang = parseMD.get("content-language"); +if (lang!=null) return lang; +// lang attribute +return parseMD.get("lang"); + } static class LanguageParser { Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Feb 16 10:20:22 2010 @@ -40,7 +40,8 @@ "document 2 titlethis is english", "document 3 titlethis is english" }; - String metalanguages[] = { "fi", "en", "en" }; + // NUTCH-794 : temporarily replaced "fi" and "en" with null + String metalanguages[] = { null, "en", "en" }; /** * Test parsing of language identifiers from html
svn commit: r911905 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jnioche Date: Fri Feb 19 18:49:49 2010 New Revision: 911905 URL: http://svn.apache.org/viewvc?rev=911905&view=rev Log: NUTCH-719 fetchQueues.totalSize incorrect in Fetcher Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=911905&r1=911904&r2=911905&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 19 18:49:49 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) + * NUTCH-790 Some external javadoc links are broken (siren) * NUTCH-766 Tika parser (jnioche via mattmann) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=911905&r1=911904&r2=911905&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Feb 19 18:49:49 2010 @@ -338,7 +338,7 @@ if (it != null) addFetchItem(it); } -public void addFetchItem(FetchItem it) { +public synchronized void addFetchItem(FetchItem it) { FetchItemQueue fiq = getFetchItemQueue(it.queueID); fiq.addFetchItem(it); totalSize.incrementAndGet();
svn commit: r917557 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/parse/HtmlParseFilters.java
Author: jnioche Date: Mon Mar 1 15:08:05 2010 New Revision: 917557 URL: http://svn.apache.org/viewvc?rev=917557&view=rev Log: NUTCH-782: Ability to order htmlparsefilters Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557&r1=917556&r2=917557&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 1 15:08:05 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-782 Ability to order htmlparsefilters (jnioche) + * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) * NUTCH-790 Some external javadoc links are broken (siren) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557&r1=917556&r2=917557&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 1 15:08:05 2010 @@ -996,6 +996,18 @@ for most people would be "img,script,link". + + htmlparsefilter.order + + The order by which HTMLParse filters are applied. + If empty, all available HTMLParse filters (as dictated by properties + plugin-includes and plugin-excludes above) are loaded and applied in system + defined order. If not empty, only named filters are loaded and applied + in given order. + HTMLParse filter ordering MAY have an impact + on end result, as some filters could rely on the metadata generated by a previous filter. + + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557&r1=917556&r2=917557&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Mar 1 15:08:05 2010 @@ -17,6 +17,7 @@ package org.apache.nutch.parse; +import java.util.ArrayList; import java.util.HashMap; import org.apache.nutch.protocol.Content; @@ -30,12 +31,23 @@ public class HtmlParseFilters { private HtmlParseFilter[] htmlParseFilters; + + public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order"; public HtmlParseFilters(Configuration conf) { +String order = conf.get(HTMLPARSEFILTER_ORDER); ObjectCache objectCache = ObjectCache.get(conf); this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName()); if (htmlParseFilters == null) { -HashMap filters = + /* + * If ordered filters are required, prepare array of filters based on + * property + */ + String[] orderedFilters = null; + if (order != null && !order.trim().equals("")) { +orderedFilters = order.split("\\s+"); + } +HashMap filterMap = new HashMap(); try { ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID); @@ -45,12 +57,31 @@ for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; HtmlParseFilter parseFilter = (HtmlParseFilter) extension.getExtensionInstance(); -if (!filters.containsKey(parseFilter.getClass().getName())) { -filters.put(parseFilter.getClass().getName(), parseFilter); +if (!filterMap.containsKey(parseFilter.getClass().getName())) { +filterMap.put(parseFilter.getClass().getName(), parseFilter); } } -HtmlParseFilter[] htmlParseFilters = filters.values().toArray(new HtmlParseFilter[filters.size()]); -objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); +HtmlParseFilter[] htmlParseFilters = filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]); +/* + * If no ordered filters required, just get the filters in an + * indeterminate order + */ +if (orderedFilters == null) { + objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); +} +/* Otherwise run the filters in the requ
svn commit: r919358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrIndexer.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Author: jnioche Date: Fri Mar 5 10:09:08 2010 New Revision: 919358 URL: http://svn.apache.org/viewvc?rev=919358&view=rev Log: NUTCH-799 SOLRIndexer to commit once all reducers have finished Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=919358&r1=919357&r2=919358&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 5 10:09:08 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) + * NUTCH-782 Ability to order htmlparsefilters (jnioche) * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=919358&r1=919357&r2=919358&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Fri Mar 5 10:09:08 2010 @@ -37,6 +37,8 @@ import org.apache.nutch.indexer.NutchIndexWriterFactory; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; +import org.apache.solr.client.solrj.SolrServer; +import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; public class SolrIndexer extends Configured implements Tool { @@ -71,6 +73,12 @@ FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); + // do the commits once and for all the reducers in one go + SolrServer solr = new CommonsHttpSolrServer(solrUrl); + solr.commit(); +} +catch (Exception e){ + LOG.error(e); } finally { FileSystem.get(job).delete(tmp, true); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=919358&r1=919357&r2=919358&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Fri Mar 5 10:09:08 2010 @@ -74,7 +74,7 @@ solr.add(inputDocs); inputDocs.clear(); } - solr.commit(); + // solr.commit(); } catch (final SolrServerException e) { throw makeIOException(e); }
svn commit: r921831 - in /lucene/nutch/trunk: ./ lib/
Author: jnioche Date: Thu Mar 11 13:06:12 2010 New Revision: 921831 URL: http://svn.apache.org/viewvc?rev=921831&view=rev Log: NUTCH-798 : Upgrade to SOLR1.4 and its dependencies Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar (with props) lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar (with props) lucene/nutch/trunk/lib/commons-httpclient-3.1.jar (with props) lucene/nutch/trunk/lib/commons-io-1.4.jar (with props) lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar (with props) lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar (with props) lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar (with props) lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar (with props) Removed: lucene/nutch/trunk/lib/apache-solr-common-1.3.0.jar lucene/nutch/trunk/lib/apache-solr-solrj-1.3.0.jar lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar lucene/nutch/trunk/lib/slf4j-api-1.4.3.jar Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921831&r1=921830&r2=921831&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:06:12 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) + * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) * NUTCH-782 Ability to order htmlparsefilters (jnioche) Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.1.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/commons-io-1.4.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-io-1.4.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-io-1.4.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar?rev=921831&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk
svn commit: r921840 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml src/plugin/build.xml src/plugin/parse-mp3/ src/plugin/parse-rtf/
Author: jnioche Date: Thu Mar 11 13:25:44 2010 New Revision: 921840 URL: http://svn.apache.org/viewvc?rev=921840&view=rev Log: NUTCH-801 Remove RTF and MP3 parse plugins Removed: lucene/nutch/trunk/src/plugin/parse-mp3/ lucene/nutch/trunk/src/plugin/parse-rtf/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921840&r1=921839&r2=921840&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:25:44 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche) + * NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?rev=921840&r1=921839&r2=921840&view=diff == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Mar 11 13:25:44 2010 @@ -124,13 +124,11 @@ - - + - - + @@ -198,8 +196,6 @@ - - - + http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=921840&r1=921839&r2=921840&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Thu Mar 11 13:25:44 2010 @@ -52,14 +52,12 @@ - -
svn commit: r926003 - in /lucene/nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/pro
Author: jnioche Date: Mon Mar 22 09:00:11 2010 New Revision: 926003 URL: http://svn.apache.org/viewvc?rev=926003&view=rev Log: NUTCH-740 Configuration option to override default language for fetched pages Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003&r1=926002&r2=926003&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) + * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003&r1=926002&r2=926003&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010 @@ -228,6 +228,15 @@ + + http.accept.language + en-us,en-gb,en;q=0.7,*;q=0.3 + Value of the "Accept-Language" request header field. + This allows selecting non-English language as default one to retrieve. + It is a useful setting for search engines build for certain national group. + + + Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003&r1=926002&r2=926003&view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Mar 22 09:00:11 2010 @@ -93,6 +93,8 @@ public abstract class HttpBase implement "http://lucene.apache.org/nutch/bot.html";, "nutch-ag...@lucene.apache.org"); + /** The "Accept-Language" request header value. */ + protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; /** * Maps from host to a Long naming the time it should be unblocked. @@ -162,6 +164,7 @@ public abstract class HttpBase implement this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1); this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); +this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000); // backward-compatible default setting @@ -326,6 +329,13 @@ public abstract class HttpBase implement return userAgent; } + /** Value of "Accept-Language" request header sent by Nutch. + * @return The value of the header "Accept-Language" header. + */ + public String getAcceptLanguage() { + return acceptLanguage; + } + public boolean getUseHttp11() { return useHttp11; } @@ -470,6 +480,7 @@ public abstract class HttpBase implement logger.info("http.timeout = " + timeout); logger.info("http.content.limit = " + maxContent); logger.info("http.agent = " + userAgent); + logger.info("http.accept.language = " + acceptLanguage); logger.info(Protocol.CHECK_BLOCKING + " = " + checkBlocking); logger.info(Protocol.CHECK_ROBOTS + " = " + checkRobots); if (checkBlocking) { Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003&r1=926002&r2=926003&view=diff =
svn commit: r926155 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/tools/ src/test/org/apache/nutch/crawl/ src/test/org/ap
Author: jnioche Date: Mon Mar 22 16:19:12 2010 New Revision: 926155 URL: http://svn.apache.org/viewvc?rev=926155&view=rev Log: NUTCH-762 : Generator can generate several segments in one parse of the crawlDB Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java Removed: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926155&r1=926154&r2=926155&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 16:19:12 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) + * NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926155&r1=926154&r2=926155&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 16:19:12 2010 @@ -514,24 +514,21 @@ - generate.max.per.host + generate.max.count -1 - The maximum number of urls per host in a single - fetchlist. -1 if unlimited. + The maximum number of urls in a single + fetchlist. -1 if unlimited. The urls are counted according + to the value of the parameter generator.count.mode. + - generate.max.per.host.by.ip - false - If false, same host names are counted. If true, - hosts' IP addresses are resolved and the same IP-s are counted. - - -+-+-+- WARNING !!! -+-+-+- - When set to true, Generator will create a lot of DNS lookup - requests, rapidly. This may cause a DOS attack on - remote DNS servers, not to mention increased external traffic - and latency. For these reasons when using this option it is - required that a local caching DNS be used. + generate.count.mode + host + Determines how the URLs are counted for generator.max.count. + Default value is 'host' but can be 'domain'. Note that we do not count + per IP in the new version of the Generator. + @@ -545,6 +542,34 @@ updatedb will generate identical fetchlists. + + generate.max.per.host + -1 + (Deprecated). Use generate.max.count and generate.count.mode instead. + The maximum number of urls per host in a single + fetchlist. -1 if unlimited. + + + + + partition.url.mode + byHost + Determines how to partition URLs. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. + + + + + crawl.gen.delay + 60480 + + This value, expressed in days, defines how long we should keep the lock on records + in CrawlDb that were just selected for fetching. If these records are not updated + in the meantime, the lock is canceled, i.e. the become eligible for selecting. + Default value of this is 7 days. + + + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=926155&r1=926154&r2=926155&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 22 16:19:12 2010 @@ -124,17 +124,17 @@ public class Crawl { injector.inject(crawlDb, rootUrlDir); int i; for (i = 0; i < depth; i++) { // generate new segment - Path segment = generator.generate(crawlDb, segments, -1, topN, System + Path[] segs = generator.generate(crawlDb, segments, -1, topN, System .currentTimeMillis()); - if (segment == null) { + if (segments == null) { LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; } - fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it + fetcher.fetch(segs[0], threads, org.apache.nutch.fe
svn commit: r926163 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: jnioche Date: Mon Mar 22 16:29:30 2010 New Revision: 926163 URL: http://svn.apache.org/viewvc?rev=926163&view=rev Log: fixed NPE introduced in NUTCH-762 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=926163&r1=926162&r2=926163&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Mar 22 16:29:30 2010 @@ -480,7 +480,7 @@ public class Generator extends Configure LOG.info("Generator: topN: " + topN); } -if (getConf().get(GENERATE_MAX_PER_HOST_BY_IP).equals("true")){ +if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){ LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead"); }
svn commit: r928746 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/CrawlDBScanner.java
Author: jnioche Date: Mon Mar 29 12:12:09 2010 New Revision: 928746 URL: http://svn.apache.org/viewvc?rev=928746&view=rev Log: NUTCH-784 : CrawlDBScanner Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=928746&r1=928745&r2=928746&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 29 12:12:09 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-784 CrawlDBScanner (jnioche) + * NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) * NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=928746&view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Mon Mar 29 12:12:09 2010 @@ -0,0 +1,165 @@ +package org.apache.nutch.tools; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +/** + * Dumps all the entries matching a regular expression on their URL. Generates a + * text representation of the CrawlDatum-s or binary objects which can then be + * used as a new CrawlDB. The dump mechanism of the crawldb reader is not very + * useful on large crawldbs as the ouput can be extremely large and the -url + * function can't help if we don't know what url we want to have a look at. + * + * @author : Julien Nioche + */ + +public class CrawlDBScanner extends Configured implements Tool, +Mapper, Reducer { + + public static final Log LOG = LogFactory.getLog(CrawlDBScanner.class); + + public CrawlDBScanner() {} + + public CrawlDBScanner(Configuration conf) { +setConf(conf); + } + + public void close() {} + + private String regex = null; + private String status = null; + + public void configure(JobConf job) { +regex = job.get("CrawlDBScanner.regex"); +status = job.get("CrawlDBScanner.status"); + } + + public void map(Text url, CrawlDatum crawlDatum, + OutputCollector output, Reporter reporter) throws IOException { + +// check status +if (status != null +&& !status.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus( return; + +// if URL matched regexp dump it +if (url.toString().matches(regex)) { + output.collect(url, crawlDatum); +} + } + + public void reduce(Text key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { +while (values.hasNext()) { + CrawlDatum val = values.next(); + output.collect(key, val); +} + } + + private void scan(Path crawlDb, Path outputPath, String regex, String status, + boolean text) throws IOException { + +JobConf job = new NutchJob(getConf()); + +job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex); + +job.set("CrawlDBScanner.regex", regex); +if (status != null) job.set("CrawlDBScanner.status", status); + +FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); +job.setInputFormat(SequenceFileInputFormat.class); + +job.setMapperClass(CrawlDBScanner.class); +job.setReducerClass(CrawlDBScanner.class); + +FileOutputFormat.setOutputPath(job, outputPath); + +// if we want a text dump of the entries +// in or
svn commit: r929038 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/
Author: jnioche Date: Tue Mar 30 08:30:28 2010 New Revision: 929038 URL: http://svn.apache.org/viewvc?rev=929038&view=rev Log: NUTCH-779 Mechanism for passing metadata from parse to crawldb Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929038&r1=929037&r2=929038&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:30:28 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) + * NUTCH-784 CrawlDBScanner (jnioche) * NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=929038&r1=929037&r2=929038&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Tue Mar 30 08:30:28 2010 @@ -479,6 +479,15 @@ + + db.parsemeta.to.crawldb + + Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). + Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' + will copy both the key 'lang' and its value to the corresponding entry in the crawldb. + + + db.fetch.retry.max 3 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=929038&r1=929037&r2=929038&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Mar 30 08:30:28 2010 @@ -82,6 +82,8 @@ public class CrawlDatum implements Writa public static final byte STATUS_INJECTED = 0x42; /** Page discovered through a link. */ public static final byte STATUS_LINKED= 0x43; + /** Page got metadata from a parser */ + public static final byte STATUS_PARSE_META= 0x44; public static final HashMap statNames = new HashMap(); @@ -101,6 +103,7 @@ public class CrawlDatum implements Writa statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm"); statNames.put(STATUS_FETCH_GONE, "fetch_gone"); statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified"); +statNames.put(STATUS_PARSE_META, "parse_metadata"); oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED); oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=929038&r1=929037&r2=929038&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Mar 30 08:30:28 2010 @@ -20,6 +20,7 @@ package org.apache.nutch.crawl; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Map.Entry; import java.io.IOException; // Commons Logging imports @@ -71,7 +72,8 @@ public class CrawlDbReducer implements R byte[] signature = null; boolean multiple = false; // avoid deep copy when only single value exists linked.clear(); - +org.apache.hadoop.io.MapWritable metaFromParse = null; + while (values.hasNext()) { CrawlDatum datum = (CrawlDatum)values.next(); if (!multiple && values.hasNext()) multiple = true; @@ -120,6 +122,9 @@ public class CrawlDbReducer implements R case CrawlDatum.STATUS_SIGNATURE: signature = datum.getSignature(); break; + case CrawlDatum.STATUS_PARSE_META: +metaFromParse = datum.getMetaData(); +break; default: LOG.warn("Unknown status, key: " + key + ", datum: " + datum); } @@ -233,6 +238,11 @@ public class CrawlDbReducer implements R else result.setStatus(CrawlDat
svn commit: r929039 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jnioche Date: Tue Mar 30 08:35:49 2010 New Revision: 929039 URL: http://svn.apache.org/viewvc?rev=929039&view=rev Log: NUTCH 785 : Fetcher : copy metadata from origin URL when redirecting + call scfilters.initialScore on newly created URL Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929039&r1=929038&r2=929039&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:35:49 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) + * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) * NUTCH-784 CrawlDBScanner (jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=929039&r1=929038&r2=929039&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Mar 30 08:35:49 2010 @@ -46,6 +46,7 @@ import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; +import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.*; @@ -656,6 +657,9 @@ public class Fetcher extends Configured if (redirUrl != null) { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore()); +// transfer existing metadata to the redir +newDatum.getMetaData().putAll(fit.datum.getMetaData()); +scfilters.initialScore(redirUrl, newDatum); if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); @@ -694,6 +698,9 @@ public class Fetcher extends Configured if (redirUrl != null) { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore()); + // transfer existing metadata + newDatum.getMetaData().putAll(fit.datum.getMetaData()); + scfilters.initialScore(redirUrl, newDatum); if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); @@ -809,6 +816,13 @@ public class Fetcher extends Configured } else { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval()); + // transfer existing metadata + newDatum.getMetaData().putAll(datum.getMetaData()); + try { +scfilters.initialScore(url, newDatum); + } catch (ScoringFilterException e) { +e.printStackTrace(); + } if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
svn commit: r931098 - in /lucene/nutch/trunk: ./ conf/ lib/ src/plugin/ src/plugin/parse-tika/ src/plugin/parse-tika/lib/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
Author: jnioche Date: Tue Apr 6 11:38:26 2010 New Revision: 931098 URL: http://svn.apache.org/viewvc?rev=931098&view=rev Log: NUTCH-810 Upgraded to Tika 0.7 Added: lucene/nutch/trunk/lib/tika-core-0.7.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar (with props) Removed: lucene/nutch/trunk/lib/tika-core-0.6.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/tika-mimetypes.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 6 11:38:26 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-810 Upgrade to Tika 0.7 (jnioche) + * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff == --- lucene/nutch/trunk/conf/tika-mimetypes.xml (original) +++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr 6 11:38:26 2010 @@ -2198,7 +2198,11 @@ - + + + + + @@ -3551,7 +3555,13 @@ bad HTML, unfortunately. --> + + + + + + Added: lucene/nutch/trunk/lib/tika-core-0.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr 6 11:38:26 2010 @@ -32,8 +32,8 @@ - - + + @@ -65,12 +65,12 @@ - + - + @@ -99,7 +99,6 @@ - @@ -107,7 +106,6 @@ - @@ -172,11 +170,11 @@ - + - + Modified: lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=931098&r1=931097&r2=931098&view=diff == --- lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Apr 6 11:38:26 2010 @@ -1,7 +1,7 @@ - + Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar?rev=931098&view=auto == Binary file - no diff available. Propchange: luc