svn commit: r894716 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml

2009-12-30 Thread jnioche
Author: jnioche
Date: Wed Dec 30 21:34:28 2009
New Revision: 894716

URL: http://svn.apache.org/viewvc?rev=894716&view=rev
Log:
Adding J. Nioche to the list of committers

Modified:
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/credits.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?rev=894716&r1=894715&r2=894716&view=diff
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Wed Dec 30 21:34:28 2009
@@ -252,6 +252,10 @@
 
   
 
+http://www.digitalpebble.com/";>Julien Nioche
+
+  
+
 http://people.apache.org/~siren";>Sami Siren
 
   
@@ -261,7 +265,7 @@
 
 
 
-
+
 Friends
 
 
@@ -292,7 +296,7 @@
 
 
 
-
+
 Sponsors
 
 

Modified: lucene/nutch/trunk/site/credits.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?rev=894716&r1=894715&r2=894716&view=diff
==
--- lucene/nutch/trunk/site/credits.pdf (original)
+++ lucene/nutch/trunk/site/credits.pdf Wed Dec 30 21:34:28 2009
@@ -58,10 +58,10 @@
 >>
 endobj
 14 0 obj
-<< /Length 2451 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2523 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gat=-?#uMo'RekG6)QN,j<4X4IDDVKe8Cd99l3ZjVX=Q[b.BLBhuXiFrUmi*,VH<1...@__ei(`FER#:35&J-3KkLmIc0$E/-9at+C5'JL_g:M-`ZXIRr>cMV(?8=o8Z43rXNS`.lKsK^`(anX=FV;m0$Sh;[&*WTGKJTabq4PB&nG%HT1<]HqfD`^fK]+DM"GC][A;AauDhV\=BF-6%+--+,-R<^Q`J-rq1^/aI!E7A77`*g/j,2T[+;*_3p]F4O:C]]NLjH]*W>a3p$E[tVeq&1,Q9`X>&AoU`^21U3M3(8Y,Y"8+d:rLd/?E>rprgkLDtPtOH9;iBRt,/sV1!"^kLNh#W?Xc;Z=^0N*TO_9#QGEfj)K%-/N_]VP#g)tHj?tS>b1CVc1X8#u7>Ig^0>aenmP)fY)!'l;.LN!;sR5V8_qK>q?BssHf[1`%[p=)^/[j['-n/*i...@j8c=+o1v+oasfe;<&/X[k+:G!Fa`7,=6%UB(je=+0Jk]pJuk0$Ful&w0jqci.wtfp?...@jip'*(4IN(nFWUC'4$+ASc0$'Hb0).HOX.nYPOI'ZfjA4lGoAq.OB7pr*@;,&dS]W^Y".".TVojXN,X0'^8csme]rov...@mrm=&7Ume#n129mnp^<[\n!"AiTa(oU;!=fZ7KED][iPuL2sG4B2QRR;R8h]pal\kC5TJ`i!B1dgG;M/ZcccB^2R[RqCf=sJoXH_\a#TIr0[=g'YE<]#JSS-7VY96Wr.ok]i=r\uB18i+V7Ss'SGHVU[Qb5/Q5TWMK3h(]eqes3?1^...@ad(So.oH1',$NU^Vd0*s7V-7,/:Z[3MHCgmf...@aa$iphpn0^:5L1ee7)3\(au1D0)OB\]4nLeh+trE:]>o[ep'HMp0S"6f*#u`m_=1)$r+/?0fr%8)ZF4M9-8$qiO8pl&$RTRX&;'t5i!`R=2hX%*)iE^Beb>'hN'B_]>]Q9K$mCB;OKPjEB#qBR8FHam6Gc+o6UEhb-kk2j:)X;T.C08B"Mf2l]$XQn0D%X<)b'213CgRMXW+^tAD'KI>J5,Sk/mc7!*...@$xk*%*p=z.(>r-Vcj#&EO>rcOd"WY'T"@sLig2Y'taK-,a-qM[rFHonKAtmbo1ejl28\D7UmR'edeKrfRJNN6G'FB5XBB=$3rGK9B(0DX,]nr`d...@`-q)"pJ?'%5m-cho3$21...@l!4?0y>ck3^lHrB4+UOCPg#tMPmHhVcMDK0f<-5e+uxw]j#...@qs-%sbew.jf15ga\^<`,1S+C=C-df;1RW])_L(N8X,+O2I;PjoBEdq"j[4Bb`o8s'nn'=NZ0ZN-f]T05Hk/tG(2[H&)qDr>NDb0!qqU^G<'Zp4:7+dW)^0sm#,\V;7V0cjr1p)f%/qYmj-Z?$AUlBL1%XlNB'tbd)?+Q^H,QmfH2%'N%_oPR?e$KQgk$dJLFe)cW2K6.Q%P8Zo1k"RAhX>59n?j...@mez0l[inp$=f$knplk[)_>J=psS7?oce?FP[7&Pe_$&g8ck",p<+RDZPcD%]pJJI>,k...@j3h?p5n:o%&3krIkL#pRsD#!?h]F0FGL\q2oJ"5:7u3j.f.!$/5=L,M8k5bmiXYO9t>D%($\s6U-#QdabpH>@gbM8BONu7"[aH:N\^fLQb6,C?YISJ#LRlDar#mfYU4Fk*_:IRVLV?eD)?r#-0;hW[7M/pHR5ZLdq~>
+Gat=.gN&fR&:NH>+I/skrF!LtiR*u_M./K*UroO.4S$1Q#_\,o.Cdrh[?:`3(CDk$A_c#$'6^EnKoo'=DV"P[cj(1`Z!/GLqMlI'_P>Z=j'(/nk...@fmq3\jo#r1]&S]l-i",?Ph-``Gi-s;WK:i$e4Z``]21SYbbc$nE5rp]<=[@(H\rE*[q"&O7#Ynt4#%'?4'qE"87L)qbI0Jpm,!pItf#<5'$l$ec7b_j&,l[#2co1DBg(Z_cUknM!=eoou>&HLfcsp#HaX$g%+;k.b)A5`W!&ateTp>Ht/+0leNc/VuN[:mh^84EM8?!]/Z6e'(ch"95enOV7h.L'?p:(esRj9'XYQ4`BF1r1#H<(tG6F0S8$DNcU*5f)&Y$
 
fZ/@DdNgr>_D,g...@iup?phkyk5$kx!m0_fthr=hu;NN&]i8ld]!...@w6z>K7VWl)3...@j#c-hr;/KM[rdlSS&mT)'0cICU8Vcj.Z34NK=H_.m[?V#TJAdWN5eO'^tCpoT[8uf:>*8=Tp(GA]lQV]eR7Fi5"8'j3QEXYqBXrAWRK:p%nl-MR`5[BNc071,lDD.#t<+lT3s$D,)q44Do57+bTjq6RhRmfKMfp3f5<9e_iL;B<3U2$i=aO@@F0R_HkVG4]HK'Pf&WA`di`57QZI+)4...@n,#0#!]0&'oZ\sqoj?(/4NSg6MNC%ic)NLOh5$fQ9_Xf3Q?D37nT]ukQZ'fEd;8bYhZFL6Cc=Ac:6:'.S6kGu^5[tF9\IIri=cO7?r=mMb49P+#BZ^mrkc9B*B0)oZnYHkiG*P=[mO]lN+&&NISUnlgCh8l*/ZRCj`TCTDfi]S4Q8JEbKVB&%n+U$9p:*q*"aCnLBmVaka&#DC74u0o_;2rV,BoU0_m-Fu`+QVuUtbfUk/`K::_m[*:=LC=8j<%_%TZs4ecoCc1(kBo^O[B#n[DE"U&@a1="7#s5doSj<5,Ed*.i:YC+U#;?2=t2\ql-tZFnM?9&fQ?P3\jDgL*-R(N)X<3B.2?.Y%$:Z&<9p...@a3)OGT/vmtdajse...@2/%Fjf(DDn14:Vq6WNIQ4hnfCH9''p!^_aPj*4Jgo3uq6:9B7U*W;or

svn commit: r895972 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/ProtocolStatus.java

2010-01-05 Thread jnioche
Author: jnioche
Date: Tue Jan  5 10:14:49 2010
New Revision: 895972

URL: http://svn.apache.org/viewvc?rev=895972&view=rev
Log:
NUTCH-658 : Add Counter for # of doc fetched in Reporter

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jan  5 10:14:49 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-658 Use counters to report fetching and parsing status (jnioche)
+
 * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)
 
 * NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via 
ab)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan  
5 10:14:49 2010
@@ -607,6 +607,7 @@
   LOG.debug("Denied by robots.txt: " + fit.url);
 }
 output(fit.url, fit.datum, null, 
ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+reporter.incrCounter("FetcherStatus", "robots_denied", 1);
 continue;
   }
   if (rules.getCrawlDelay() > 0) {
@@ -615,6 +616,7 @@
   fetchQueues.finishFetchItem(fit, true);
   LOG.debug("Crawl-Delay for " + fit.url + " too long (" + 
rules.getCrawlDelay() + "), skipping");
   output(fit.url, fit.datum, null, 
ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+  reporter.incrCounter("FetcherStatus", 
"robots_denied_maxcrawldelay", 1);
   continue;
 } else {
   FetchItemQueue fiq = 
fetchQueues.getFetchItemQueue(fit.queueID);
@@ -630,6 +632,8 @@
 
   String urlString = fit.url.toString();
 
+  reporter.incrCounter("FetcherStatus", status.getName(), 1);
+  
   switch(status.getCode()) {
 
   case ProtocolStatus.WOULDBLOCK:
@@ -664,6 +668,7 @@
 } else {
   // stop redirecting
   redirecting = false;
+  reporter.incrCounter("FetcherStatus", 
"FetchItem.notCreated.redirect", 1);
 }
   }
 }
@@ -701,6 +706,7 @@
   } else {
 // stop redirecting
 redirecting = false;
+reporter.incrCounter("FetcherStatus", 
"FetchItem.notCreated.redirect", 1);
   }
 } else {
   // stop redirecting
@@ -926,6 +932,7 @@
   if (parseResult != null && !parseResult.isEmpty()) {
 Parse p = parseResult.get(content.getUrl());
 if (p != null) {
+  reporter.incrCounter("ParserStatus", 
ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
   return p.getData().getStatus();
 }
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue 
Jan  5 10:14:49 2010
@@ -93,6 +93,8 @@
   Parse parse = entry.getValue();
   ParseStatus parseStatus = parse.getData().getStatus();
   
+  reporter.incrCounter("ParserStatus", 
ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+  
   if (!parseStatus.isSuccess()) {
 LOG.warn("Error parsing: " + key + ": " + parseStatus);
 parse = parseStatus.getEmptyParse(getConf());

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java

svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java

2010-01-06 Thread jnioche
Author: jnioche
Date: Wed Jan  6 17:01:51 2010
New Revision: 896539

URL: http://svn.apache.org/viewvc?rev=896539&view=rev
Log:
NUTCH-655 : Injecting Crawl metadata (jnioche)

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539&r1=896538&r2=896539&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan  6 17:01:51 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-655 Injecting Crawl metadata (jnioche)
+
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)
 
 * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539&r1=896538&r2=896539&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan  6 
17:01:51 2010
@@ -37,10 +37,21 @@
 import org.apache.nutch.util.NutchJob;
 
 /** This class takes a flat file of URLs and adds them to the of pages to be
- * crawled.  Useful for bootstrapping the system. */
+ * crawled.  Useful for bootstrapping the system. 
+ * The URL files contain one URL per line, optionally followed by custom 
metadata 
+ * separated by tabs with the metadata key separated from the corresponding 
value by '='. 
+ * Note that some metadata keys are reserved : 
+ * - nutch.score : allows to set a custom score for a specific URL 
+ * - nutch.fetchInterval : allows to set a custom fetch interval for a 
specific URL 
+ * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 
\t userType=open_source
+ **/
 public class Injector extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Injector.class);
-
+  
+  /** metadata key reserved for setting a custom score for a specific URL */
+  public static String nutchScoreMDName = "nutch.score";
+  /** metadata key reserved for setting a custom fetchInterval for a specific 
URL */
+  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
 
   /** Normalize and filter injected urls. */
   public static class InjectMapper implements Mapper {
@@ -68,6 +79,36 @@
 OutputCollector output, Reporter 
reporter)
   throws IOException {
   String url = value.toString();  // value is line of text
+  // if tabs : metadata that could be stored
+  // must be name=value and separated by \t
+  float customScore = -1f;
+  int customInterval = interval;
+  Map metadata = new TreeMap();
+  if (url.indexOf("\t")!=-1){
+ String[] splits = url.split("\t");
+ url = splits[0];
+ for (int s=1;s keysIter = metadata.keySet().iterator();
+while (keysIter.hasNext()){
+   String keymd = keysIter.next();
+   String valuemd = metadata.get(keymd);
+   datum.getMetaData().put(new Text(keymd), new Text(valuemd));
 }
 output.collect(value, datum);
   }




svn commit: r896545 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

2010-01-06 Thread jnioche
Author: jnioche
Date: Wed Jan  6 17:08:17 2010
New Revision: 896545

URL: http://svn.apache.org/viewvc?rev=896545&view=rev
Log:
NUTCH-658 : small fix + renamed status value Exception into 
AboveExceptionThresholdInQueue

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=896545&r1=896544&r2=896545&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jan  
6 17:08:17 2010
@@ -717,7 +717,8 @@
   case ProtocolStatus.EXCEPTION:
 logError(fit.url, status.getMessage());
 int killedURLs = 
fetchQueues.checkExceptionThreshold(fit.getQueueID());
-reporter.incrCounter("FetcherStatus", "Exceptions", 
killedURLs);
+if (killedURLs!=0)
+   reporter.incrCounter("FetcherStatus", 
"AboveExceptionThresholdInQueue", killedURLs);
 /* FALLTHROUGH */
   case ProtocolStatus.RETRY:  // retry
   case ProtocolStatus.BLOCKED:




svn commit: r897180 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2010-01-08 Thread jnioche
Author: jnioche
Date: Fri Jan  8 12:01:46 2010
New Revision: 897180

URL: http://svn.apache.org/viewvc?rev=897180&view=rev
Log:
NUTCH-269 : OOME because no upper-bound on inlinks count (stack + jnioche)

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan  8 12:01:46 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count 
(stack + jnioche)
+
 * NUTCH-655 Injecting Crawl metadata (jnioche)
 
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan  8 12:01:46 2010
@@ -384,6 +384,14 @@
 
 
 
+  db.update.max.inlinks
+  1
+  Maximum number of inlinks to take into account when updating 
+  a URL score in the crawlDB. Only the best scoring inlinks are kept. 
+  
+
+
+
   db.ignore.internal.links
   true
   If true, when adding new links to a page, links from

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri 
Jan  8 12:01:46 2010
@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.io.IOException;
 
 // Commons Logging imports
@@ -27,6 +28,7 @@
 
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.PriorityQueue;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -37,7 +39,7 @@
   
   private int retryMax;
   private CrawlDatum result = new CrawlDatum();
-  private ArrayList linked = new ArrayList();
+  private InlinkPriorityQueue linked = null;
   private ScoringFilters scfilters = null;
   private boolean additionsAllowed;
   private int maxInterval;
@@ -51,6 +53,8 @@
 maxInterval = job.getInt("db.fetch.interval.max", 0 );
 if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * 
FetchSchedule.SECONDS_PER_DAY;
 schedule = FetchScheduleFactory.getFetchSchedule(job);
+int maxLinks = job.getInt("db.update.max.inlinks", 1);
+linked = new InlinkPriorityQueue(maxLinks);
   }
 
   public void close() {}
@@ -111,7 +115,7 @@
 } else {
   link = datum;
 }
-linked.add(link);
+linked.insert(link);
 break;
   case CrawlDatum.STATUS_SIGNATURE:
 signature = datum.getSignature();
@@ -120,13 +124,21 @@
 LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
   }
 }
-
+
+// copy the content of the queue into a List
+// in reversed order
+int numLinks = linked.size();
+List linkList = new ArrayList(numLinks);
+for (int i = numLinks - 1; i >= 0; i--) {
+  linkList.add(linked.pop());
+}
+
 // if it doesn't already exist, skip it
 if (!oldSet && !additionsAllowed) return;
 
 // if there is no fetched datum, perhaps there is a link
-if (!fetchSet && linked.size() > 0) {
-  fetch = linked.get(0);
+if (!fetchSet && linkList.size() > 0) {
+  fetch = linkList.get(0);
   fetchSet = true;
 }
 
@@ -260,7 +272,7 @@
 }
 
 try {
-  scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linked);
+  scfilters.updateDbScore((Text)key, oldSet ? old : null, result, 
linkList);
 } catch (Exception e) {
   if (LOG.isWarnEnabled()) {
 LOG.warn("Couldn't update score, key=" + key + ": " + e);
@@ -270,5 +282,20 @@
 result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
 output.collect(key, result);
   }
+  
+}
 
+class InlinkPriorityQueue extends PriorityQueue {
+  
+  public InlinkPriorityQueue(int maxSize) {
+initialize(maxSize);
+  }
+  
+  /** Determines t

svn commit: r897825 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/MimeUtil.java test/org/apache/nutch/protocol/TestContent.java

2010-01-11 Thread jnioche
Author: jnioche
Date: Mon Jan 11 10:13:21 2010
New Revision: 897825

URL: http://svn.apache.org/viewvc?rev=897825&view=rev
Log:
fix for NUTCH-767 : reverted original expected values for test + treat 
text/plain as a default mime-type from Tika

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=897825&r1=897824&r2=897825&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Mon Jan 11 
10:13:21 2010
@@ -159,6 +159,7 @@
 if (this.mimeMagic) {
   MimeType magicType = this.mimeTypes.getMimeType(data);
   if (magicType != null && 
!magicType.getName().equals(MimeTypes.OCTET_STREAM)
+  && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
   && type != null && !type.getName().equals(magicType.getName())) {
 // If magic enabled and the current mime type differs from that of the
 // one returned from the magic, take the magic mimeType

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=897825&r1=897824&r2=897825&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon 
Jan 11 10:13:21 2010
@@ -63,28 +63,19 @@
 "http://www.foo.com/";,
 "".getBytes("UTF8"),
 "text/html; charset=UTF-8", p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals("text/plain", c.getContentType());
+assertEquals("text/html", c.getContentType());
 
 c = new Content("http://www.foo.com/foo.html";,
 "http://www.foo.com/";,
 "".getBytes("UTF8"),
 "", p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals("text/plain", c.getContentType());
+assertEquals("text/html", c.getContentType());
 
 c = new Content("http://www.foo.com/foo.html";,
 "http://www.foo.com/";,
 "".getBytes("UTF8"),
 null, p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals("text/plain", c.getContentType());
+assertEquals("text/html", c.getContentType());
 
 c = new Content("http://www.foo.com/";,
 "http://www.foo.com/";,
@@ -108,10 +99,7 @@
 "http://www.foo.com/";,
 "".getBytes("UTF8"),
 "", p, conf);
-// TODO check that Tika returns the right value and
-// revert to the default type
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals("text/plain", c.getContentType());
+assertEquals(MimeTypes.OCTET_STREAM, c.getContentType());
 
 c = new Content("http://www.foo.com/";,
 "http://www.foo.com/";,




svn commit: r905228 - in /lucene/nutch/trunk/lib: tika-core-0.5.jar tika-core-0.6.jar

2010-02-01 Thread jnioche
Author: jnioche
Date: Mon Feb  1 09:59:50 2010
New Revision: 905228

URL: http://svn.apache.org/viewvc?rev=905228&view=rev
Log:
NUTCH-781: upgrade tika to version 0.6

Added:
lucene/nutch/trunk/lib/tika-core-0.6.jar   (with props)
Removed:
lucene/nutch/trunk/lib/tika-core-0.5.jar

Added: lucene/nutch/trunk/lib/tika-core-0.6.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.6.jar?rev=905228&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-core-0.6.jar
--
svn:mime-type = application/octet-stream




svn commit: r905229 - /lucene/nutch/trunk/CHANGES.txt

2010-02-01 Thread jnioche
Author: jnioche
Date: Mon Feb  1 10:03:07 2010
New Revision: 905229

URL: http://svn.apache.org/viewvc?rev=905229&view=rev
Log:
NUTCH-781: upgrade tika to version 0.6

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905229&r1=905228&r2=905229&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Feb  1 10:03:07 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-781 Update Tika to v0.6 (jnioche)
+
 * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count 
(stack + jnioche)
 
 * NUTCH-655 Injecting Crawl metadata (jnioche)




svn commit: r905550 [1/2] - /lucene/nutch/trunk/conf/tika-mimetypes.xml

2010-02-02 Thread jnioche
Author: jnioche
Date: Tue Feb  2 09:31:19 2010
New Revision: 905550

URL: http://svn.apache.org/viewvc?rev=905550&view=rev
Log:
NUTCH-781 : updated tika-mimetypes.xml

Modified:
lucene/nutch/trunk/conf/tika-mimetypes.xml



svn commit: r906907 - in /lucene/nutch/trunk: CHANGES.txt conf/domain-suffixes.xml

2010-02-05 Thread jnioche
Author: jnioche
Date: Fri Feb  5 11:52:57 2010
New Revision: 906907

URL: http://svn.apache.org/viewvc?rev=906907&view=rev
Log:
NUTCH-786

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/domain-suffixes.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=906907&r1=906906&r2=906907&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Feb  5 11:52:57 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-786 Improvement to the list of suffix domains (jnioche)
+
 * NUTCH-775 Enhance searcher interface (siren)
 
 * NUTCH-781 Update Tika to v0.6 (jnioche)

Modified: lucene/nutch/trunk/conf/domain-suffixes.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-suffixes.xml?rev=906907&r1=906906&r2=906907&view=diff
==
--- lucene/nutch/trunk/conf/domain-suffixes.xml (original)
+++ lucene/nutch/trunk/conf/domain-suffixes.xml Fri Feb  5 11:52:57 2010
@@ -1744,6 +1744,16 @@
 
 
 
+
+
+
+
+
+
+
+
+
+
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 

svn commit: r910187 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

2010-02-15 Thread jnioche
Author: jnioche
Date: Mon Feb 15 09:41:05 2010
New Revision: 910187

URL: http://svn.apache.org/viewvc?rev=910187&view=rev
Log:
NUTCH-766: small improvement to Tika parser : prioritise default Tika parser 
when discovering plugins matching mime-type

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=910187&r1=910186&r2=910187&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Mon 
Feb 15 09:41:05 2010
@@ -343,11 +343,14 @@
   // NotMappedParserException
   
   for (int i=0; i 0) {




svn commit: r910454 - in /lucene/nutch/trunk/src/plugin/languageidentifier/src: java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

2010-02-16 Thread jnioche
Author: jnioche
Date: Tue Feb 16 10:20:22 2010
New Revision: 910454

URL: http://svn.apache.org/viewvc?rev=910454&view=rev
Log:
NUTCH-794 : Language Identification must check the parse metadata for 
language values

Modified:

lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -91,15 +91,33 @@
 
 Parse parse = parseResult.get(content.getUrl());
 
+String lang = getLanguageFromMetadata(parse.getData().getParseMeta());
+if (lang != null) {
+  parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
+  return parseResult;
+}
+
 // Trying to find the document's language
 LanguageParser parser = new LanguageParser(doc);
-String lang = parser.getLanguage();
+lang = parser.getLanguage();
 
 if (lang != null) {
   parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
 }
 return parseResult;
   }
+  
+  // Check in the metadata whether the language has already been stored there 
by Tika
+  private static String getLanguageFromMetadata(Metadata parseMD){
+// dublin core 
+String lang = parseMD.get("dc.language");
+if (lang!=null) return lang;
+// meta content-language
+lang = parseMD.get("content-language");
+if (lang!=null) return lang;
+// lang attribute
+return parseMD.get("lang");
+  }
 
   static class LanguageParser {
 

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -40,7 +40,8 @@
   "document 2 titlethis is 
english",
   "document 3 
titlethis is english" };
 
-  String metalanguages[] = { "fi", "en", "en" };
+  // NUTCH-794 : temporarily replaced "fi" and "en" with null
+  String metalanguages[] = { null, "en", "en" };
 
   /**
* Test parsing of language identifiers from html 




svn commit: r911905 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

2010-02-19 Thread jnioche
Author: jnioche
Date: Fri Feb 19 18:49:49 2010
New Revision: 911905

URL: http://svn.apache.org/viewvc?rev=911905&view=rev
Log:
NUTCH-719 fetchQueues.totalSize incorrect in Fetcher

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=911905&r1=911904&r2=911905&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Feb 19 18:49:49 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via 
jnioche) 
+
 * NUTCH-790 Some external javadoc links are broken (siren)
 
 * NUTCH-766 Tika parser (jnioche via mattmann)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=911905&r1=911904&r2=911905&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Feb 
19 18:49:49 2010
@@ -338,7 +338,7 @@
   if (it != null) addFetchItem(it);
 }
 
-public void addFetchItem(FetchItem it) {
+public synchronized void addFetchItem(FetchItem it) {
   FetchItemQueue fiq = getFetchItemQueue(it.queueID);
   fiq.addFetchItem(it);
   totalSize.incrementAndGet();




svn commit: r917557 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/parse/HtmlParseFilters.java

2010-03-01 Thread jnioche
Author: jnioche
Date: Mon Mar  1 15:08:05 2010
New Revision: 917557

URL: http://svn.apache.org/viewvc?rev=917557&view=rev
Log:
NUTCH-782: Ability to order htmlparsefilters

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  1 15:08:05 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-782 Ability to order htmlparsefilters (jnioche)
+
 * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via 
jnioche) 
 
 * NUTCH-790 Some external javadoc links are broken (siren)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar  1 15:08:05 2010
@@ -996,6 +996,18 @@
   for most people would be "img,script,link".
 
 
+
+  htmlparsefilter.order
+  
+  The order by which HTMLParse filters are applied.
+  If empty, all available HTMLParse filters (as dictated by properties
+  plugin-includes and plugin-excludes above) are loaded and applied in system
+  defined order. If not empty, only named filters are loaded and applied
+  in given order.
+  HTMLParse filter ordering MAY have an impact
+  on end result, as some filters could rely on the metadata generated by a 
previous filter.
+  
+
 
 
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
Mon Mar  1 15:08:05 2010
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.parse;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.nutch.protocol.Content;
@@ -30,12 +31,23 @@
 public class HtmlParseFilters {
 
   private HtmlParseFilter[] htmlParseFilters;
+  
+  public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
 
   public HtmlParseFilters(Configuration conf) {
+String order = conf.get(HTMLPARSEFILTER_ORDER);
 ObjectCache objectCache = ObjectCache.get(conf);
 this.htmlParseFilters = (HtmlParseFilter[]) 
objectCache.getObject(HtmlParseFilter.class.getName());
 if (htmlParseFilters == null) {
-HashMap filters =
+  /*
+   * If ordered filters are required, prepare array of filters based on
+   * property
+   */
+  String[] orderedFilters = null;
+  if (order != null && !order.trim().equals("")) {
+orderedFilters = order.split("\\s+");
+  }
+HashMap filterMap =
   new HashMap();
 try {
 ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID);
@@ -45,12 +57,31 @@
 for (int i = 0; i < extensions.length; i++) {
 Extension extension = extensions[i];
 HtmlParseFilter parseFilter = (HtmlParseFilter) 
extension.getExtensionInstance();
-if 
(!filters.containsKey(parseFilter.getClass().getName())) {
-filters.put(parseFilter.getClass().getName(), 
parseFilter);
+if 
(!filterMap.containsKey(parseFilter.getClass().getName())) {
+filterMap.put(parseFilter.getClass().getName(), 
parseFilter);
 }
 }
-HtmlParseFilter[] htmlParseFilters = 
filters.values().toArray(new HtmlParseFilter[filters.size()]);
-objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters);
+HtmlParseFilter[] htmlParseFilters = 
filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]);
+/*
+ * If no ordered filters required, just get the filters in an
+ * indeterminate order
+ */
+if (orderedFilters == null) {
+  objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters);
+}
+/* Otherwise run the filters in the requ

svn commit: r919358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrIndexer.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java

2010-03-05 Thread jnioche
Author: jnioche
Date: Fri Mar  5 10:09:08 2010
New Revision: 919358

URL: http://svn.apache.org/viewvc?rev=919358&view=rev
Log:
NUTCH-799 SOLRIndexer to commit once all reducers have finished

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar  5 10:09:08 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)
+
 * NUTCH-782 Ability to order htmlparsefilters (jnioche)
 
 * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via 
jnioche) 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java 
Fri Mar  5 10:09:08 2010
@@ -37,6 +37,8 @@
 import org.apache.nutch.indexer.NutchIndexWriterFactory;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 
 public class SolrIndexer extends Configured implements Tool {
 
@@ -71,6 +73,12 @@
 FileOutputFormat.setOutputPath(job, tmp);
 try {
   JobClient.runJob(job);
+  // do the commits once and for all the reducers in one go
+  SolrServer solr =  new CommonsHttpSolrServer(solrUrl);
+  solr.commit();
+} 
+catch (Exception e){
+  LOG.error(e);
 } finally {
   FileSystem.get(job).delete(tmp, true);
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java 
Fri Mar  5 10:09:08 2010
@@ -74,7 +74,7 @@
 solr.add(inputDocs);
 inputDocs.clear();
   }
-  solr.commit();
+  // solr.commit();
 } catch (final SolrServerException e) {
   throw makeIOException(e);
 }




svn commit: r921831 - in /lucene/nutch/trunk: ./ lib/

2010-03-11 Thread jnioche
Author: jnioche
Date: Thu Mar 11 13:06:12 2010
New Revision: 921831

URL: http://svn.apache.org/viewvc?rev=921831&view=rev
Log:
NUTCH-798 : Upgrade to SOLR1.4 and its dependencies

Added:
lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar   (with props)
lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar   (with props)
lucene/nutch/trunk/lib/commons-httpclient-3.1.jar   (with props)
lucene/nutch/trunk/lib/commons-io-1.4.jar   (with props)
lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar   (with props)
lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar   (with props)
lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar   (with props)
lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar   (with props)
Removed:
lucene/nutch/trunk/lib/apache-solr-common-1.3.0.jar
lucene/nutch/trunk/lib/apache-solr-solrj-1.3.0.jar
lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar
lucene/nutch/trunk/lib/slf4j-api-1.4.3.jar
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921831&r1=921830&r2=921831&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:06:12 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche)
+
 * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)
 
 * NUTCH-782 Ability to order htmlparsefilters (jnioche)

Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.1.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/commons-io-1.4.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-io-1.4.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-io-1.4.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk

svn commit: r921840 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml src/plugin/build.xml src/plugin/parse-mp3/ src/plugin/parse-rtf/

2010-03-11 Thread jnioche
Author: jnioche
Date: Thu Mar 11 13:25:44 2010
New Revision: 921840

URL: http://svn.apache.org/viewvc?rev=921840&view=rev
Log:
NUTCH-801 Remove RTF and MP3 parse plugins

Removed:
lucene/nutch/trunk/src/plugin/parse-mp3/
lucene/nutch/trunk/src/plugin/parse-rtf/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/parse-plugins.xml
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:25:44 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche)
+
 * NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche)
 
 * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Mar 11 13:25:44 2010
@@ -124,13 +124,11 @@

 

-   
-   
+   

 

-   
-   
+   

 

@@ -198,8 +196,6 @@


-   



-
-   
+   

http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Thu Mar 11 13:25:44 2010
@@ -52,14 +52,12 @@
  
  
  
- 
  
  
  
  
  
  
- 
  
  
  




svn commit: r926003 - in /lucene/nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/pro

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 09:00:11 2010
New Revision: 926003

URL: http://svn.apache.org/viewvc?rev=926003&view=rev
Log:
NUTCH-740 Configuration option to override default language for fetched pages

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003&r1=926002&r2=926003&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-740 Configuration option to override default language for fetched 
pages (Marcin Okraszewski via jnioche)
+
 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)
 
 * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003&r1=926002&r2=926003&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010
@@ -228,6 +228,15 @@
   
 
 
+
+  http.accept.language
+  en-us,en-gb,en;q=0.7,*;q=0.3
+  Value of the "Accept-Language" request header field.
+  This allows selecting non-English language as default one to retrieve.
+  It is a useful setting for search engines built for a certain national group.
+  
+
+
 
 
 

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003&r1=926002&r2=926003&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Mon Mar 22 09:00:11 2010
@@ -93,6 +93,8 @@ public abstract class HttpBase implement
 "http://lucene.apache.org/nutch/bot.html";,
 "nutch-ag...@lucene.apache.org");
 
+  /** The "Accept-Language" request header value. */
+  protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
 
   /**
* Maps from host to a Long naming the time it should be unblocked.
@@ -162,6 +164,7 @@ public abstract class HttpBase implement
 this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1);
 this.userAgent = getAgentString(conf.get("http.agent.name"), 
conf.get("http.agent.version"), conf
 .get("http.agent.description"), conf.get("http.agent.url"), 
conf.get("http.agent.email"));
+this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
 this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) 
* 1000);
 this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) 
* 1000);
 // backward-compatible default setting
@@ -326,6 +329,13 @@ public abstract class HttpBase implement
 return userAgent;
   }
   
+  /** Value of "Accept-Language" request header sent by Nutch.
+   * @return The value of the "Accept-Language" header.
+   */
+  public String getAcceptLanguage() {
+ return acceptLanguage;
+  }
+
   public boolean getUseHttp11() {
 return useHttp11;
   }
@@ -470,6 +480,7 @@ public abstract class HttpBase implement
   logger.info("http.timeout = " + timeout);
   logger.info("http.content.limit = " + maxContent);
   logger.info("http.agent = " + userAgent);
+  logger.info("http.accept.language = " + acceptLanguage);
   logger.info(Protocol.CHECK_BLOCKING + " = " + checkBlocking);
   logger.info(Protocol.CHECK_ROBOTS + " = " + checkRobots);
   if (checkBlocking) {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003&r1=926002&r2=926003&view=diff
=

svn commit: r926155 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/tools/ src/test/org/apache/nutch/crawl/ src/test/org/ap

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 16:19:12 2010
New Revision: 926155

URL: http://svn.apache.org/viewvc?rev=926155&view=rev
Log:
NUTCH-762 : Generator can generate several segments in one parse of the crawlDB

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
Removed:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 16:19:12 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-762 Generator can generate several segments in one parse of the 
crawlDB (jnioche)
+
 * NUTCH-740 Configuration option to override default language for fetched 
pages (Marcin Okraszewski via jnioche)
 
 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 16:19:12 2010
@@ -514,24 +514,21 @@
 
 
 
-  generate.max.per.host
+  generate.max.count
   -1
-  The maximum number of urls per host in a single
-  fetchlist.  -1 if unlimited.
+  The maximum number of urls in a single
+  fetchlist.  -1 if unlimited. The urls are counted according
+  to the value of the parameter generate.count.mode.
+  
 
 
 
-  generate.max.per.host.by.ip
-  false
-  If false, same host names are counted. If true,
-  hosts' IP addresses are resolved and the same IP-s are counted.
-  
-  -+-+-+- WARNING !!! -+-+-+-
-  When set to true, Generator will create a lot of DNS lookup
-  requests, rapidly. This may cause a DOS attack on
-  remote DNS servers, not to mention increased external traffic
-  and latency. For these reasons when using this option it is
-  required that a local caching DNS be used.
+  generate.count.mode
+  host
+  Determines how the URLs are counted for generate.max.count.
+  Default value is 'host' but can be 'domain'. Note that we do not count 
+  per IP in the new version of the Generator.
+  
 
 
 
@@ -545,6 +542,34 @@
   updatedb will generate identical fetchlists.
 
 
+
+  generate.max.per.host
+  -1
+  (Deprecated). Use generate.max.count and generate.count.mode 
instead.
+  The maximum number of urls per host in a single
+  fetchlist.  -1 if unlimited.
+
+
+
+
+  partition.url.mode
+  byHost
+  Determines how to partition URLs. Default value is 'byHost', 
+  also takes 'byDomain' or 'byIP'. 
+  
+
+
+
+  crawl.gen.delay
+  60480
+  
+   This value, expressed in days, defines how long we should keep the lock on 
records 
+   in CrawlDb that were just selected for fetching. If these records are not 
updated 
+   in the meantime, the lock is canceled, i.e. they become eligible for 
selecting. 
+   Default value of this is 7 days.
+  
+
+
 
 
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 22 
16:19:12 2010
@@ -124,17 +124,17 @@ public class Crawl {
 injector.inject(crawlDb, rootUrlDir);
 int i;
 for (i = 0; i < depth; i++) { // generate new segment
-  Path segment = generator.generate(crawlDb, segments, -1, topN, System
+  Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
   .currentTimeMillis());
-  if (segment == null) {
+  if (segments == null) {
 LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
 break;
   }
-  fetcher.fetch(segment, threads, 
org.apache.nutch.fetcher.Fetcher.isParsing(conf));  // fetch it
+  fetcher.fetch(segs[0], threads, 
org.apache.nutch.fe

svn commit: r926163 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 16:29:30 2010
New Revision: 926163

URL: http://svn.apache.org/viewvc?rev=926163&view=rev
Log:
fixed NPE introduced in NUTCH-762

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=926163&r1=926162&r2=926163&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Mar 
22 16:29:30 2010
@@ -480,7 +480,7 @@ public class Generator extends Configure
   LOG.info("Generator: topN: " + topN);
 }
 
-if (getConf().get(GENERATE_MAX_PER_HOST_BY_IP).equals("true")){
+if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){
   LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use 
partition.url.mode instead");
 }
 




svn commit: r928746 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/CrawlDBScanner.java

2010-03-29 Thread jnioche
Author: jnioche
Date: Mon Mar 29 12:12:09 2010
New Revision: 928746

URL: http://svn.apache.org/viewvc?rev=928746&view=rev
Log:
NUTCH-784 : CrawlDBScanner

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=928746&r1=928745&r2=928746&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 29 12:12:09 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-784 CrawlDBScanner (jnioche)
+
 * NUTCH-762 Generator can generate several segments in one parse of the 
crawlDB (jnioche)
 
 * NUTCH-740 Configuration option to override default language for fetched 
pages (Marcin Okraszewski via jnioche)

Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=928746&view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Mon 
Mar 29 12:12:09 2010
@@ -0,0 +1,165 @@
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * Dumps all the entries matching a regular expression on their URL. Generates 
a
+ * text representation of the CrawlDatum-s or binary objects which can then be
+ * used as a new CrawlDB. The dump mechanism of the crawldb reader is not very
+ * useful on large crawldbs as the output can be extremely large and the -url
+ * function can't help if we don't know what url we want to have a look at.
+ * 
+ * @author : Julien Nioche
+ */
+
+public class CrawlDBScanner extends Configured implements Tool,
+Mapper, 
Reducer {
+
+  public static final Log LOG = LogFactory.getLog(CrawlDBScanner.class);
+
+  public CrawlDBScanner() {}
+
+  public CrawlDBScanner(Configuration conf) {
+setConf(conf);
+  }
+
+  public void close() {}
+
+  private String regex = null;
+  private String status = null;
+
+  public void configure(JobConf job) {
+regex = job.get("CrawlDBScanner.regex");
+status = job.get("CrawlDBScanner.status");
+  }
+
+  public void map(Text url, CrawlDatum crawlDatum,
+  OutputCollector output, Reporter reporter) throws 
IOException {
+
+// check status
+if (status != null
+&& 
!status.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus( 
return;
+
+// if URL matched regexp dump it
+if (url.toString().matches(regex)) {
+  output.collect(url, crawlDatum);
+}
+  }
+
+  public void reduce(Text key, Iterator values,
+  OutputCollector output, Reporter reporter) throws 
IOException {
+while (values.hasNext()) {
+  CrawlDatum val = values.next();
+  output.collect(key, val);
+}
+  }
+
+  private void scan(Path crawlDb, Path outputPath, String regex, String status,
+  boolean text) throws IOException {
+
+JobConf job = new NutchJob(getConf());
+
+job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex);
+
+job.set("CrawlDBScanner.regex", regex);
+if (status != null) job.set("CrawlDBScanner.status", status);
+
+FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+job.setInputFormat(SequenceFileInputFormat.class);
+
+job.setMapperClass(CrawlDBScanner.class);
+job.setReducerClass(CrawlDBScanner.class);
+
+FileOutputFormat.setOutputPath(job, outputPath);
+
+// if we want a text dump of the entries
+// in or

svn commit: r929038 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/

2010-03-30 Thread jnioche
Author: jnioche
Date: Tue Mar 30 08:30:28 2010
New Revision: 929038

URL: http://svn.apache.org/viewvc?rev=929038&view=rev
Log:
NUTCH-779 Mechanism for passing metadata from parse to crawldb

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929038&r1=929037&r2=929038&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:30:28 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)
+
 * NUTCH-784 CrawlDBScanner (jnioche)
 
 * NUTCH-762 Generator can generate several segments in one parse of the 
crawlDB (jnioche)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=929038&r1=929037&r2=929038&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Tue Mar 30 08:30:28 2010
@@ -479,6 +479,15 @@
   
 
 
+ 
+  db.parsemeta.to.crawldb
+  
+  Comma-separated list of parse metadata keys to transfer to the 
crawldb (NUTCH-779).
+   Assuming for instance that the languageidentifier plugin is enabled, 
setting the value to 'lang' 
+   will copy both the key 'lang' and its value to the corresponding entry in 
the crawldb.
+  
+
+
 
   db.fetch.retry.max
   3

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=929038&r1=929037&r2=929038&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Mar 
30 08:30:28 2010
@@ -82,6 +82,8 @@ public class CrawlDatum implements Writa
   public static final byte STATUS_INJECTED  = 0x42;
   /** Page discovered through a link. */
   public static final byte STATUS_LINKED= 0x43;
+  /** Page got metadata from a parser */
+  public static final byte STATUS_PARSE_META= 0x44;
   
   
   public static final HashMap statNames = new HashMap();
@@ -101,6 +103,7 @@ public class CrawlDatum implements Writa
 statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
 statNames.put(STATUS_FETCH_GONE, "fetch_gone");
 statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
+statNames.put(STATUS_PARSE_META, "parse_metadata");
 
 oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
 oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=929038&r1=929037&r2=929038&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue 
Mar 30 08:30:28 2010
@@ -20,6 +20,7 @@ package org.apache.nutch.crawl;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map.Entry;
 import java.io.IOException;
 
 // Commons Logging imports
@@ -71,7 +72,8 @@ public class CrawlDbReducer implements R
 byte[] signature = null;
 boolean multiple = false; // avoid deep copy when only single value exists
 linked.clear();
-
+org.apache.hadoop.io.MapWritable metaFromParse = null;
+
 while (values.hasNext()) {
   CrawlDatum datum = (CrawlDatum)values.next();
   if (!multiple && values.hasNext()) multiple = true;
@@ -120,6 +122,9 @@ public class CrawlDbReducer implements R
   case CrawlDatum.STATUS_SIGNATURE:
 signature = datum.getSignature();
 break;
+  case CrawlDatum.STATUS_PARSE_META:
+metaFromParse = datum.getMetaData();
+break;
   default:
 LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
   }
@@ -233,6 +238,11 @@ public class CrawlDbReducer implements R
   else result.setStatus(CrawlDat

svn commit: r929039 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

2010-03-30 Thread jnioche
Author: jnioche
Date: Tue Mar 30 08:35:49 2010
New Revision: 929039

URL: http://svn.apache.org/viewvc?rev=929039&view=rev
Log:
NUTCH 785 : Fetcher : copy metadata from origin URL when redirecting + call 
scfilters.initialScore on newly created URL

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929039&r1=929038&r2=929039&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:35:49 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call 
scfilters.initialScore on newly created URL (jnioche)
+
 * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)
 
 * NUTCH-784 CrawlDBScanner (jnioche)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=929039&r1=929038&r2=929039&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Mar 
30 08:35:49 2010
@@ -46,6 +46,7 @@ import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.*;
 
@@ -656,6 +657,9 @@ public class Fetcher extends Configured 
   if (redirUrl != null) {
 CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
 fit.datum.getFetchInterval(), fit.datum.getScore());
+// transfer existing metadata to the redir
+newDatum.getMetaData().putAll(fit.datum.getMetaData());
+scfilters.initialScore(redirUrl, newDatum);
 if (reprUrl != null) {
   newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
   new Text(reprUrl));
@@ -694,6 +698,9 @@ public class Fetcher extends Configured 
 if (redirUrl != null) {
   CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
   fit.datum.getFetchInterval(), fit.datum.getScore());
+  // transfer existing metadata
+  newDatum.getMetaData().putAll(fit.datum.getMetaData());
+  scfilters.initialScore(redirUrl, newDatum);
   if (reprUrl != null) {
 newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
 new Text(reprUrl));
@@ -809,6 +816,13 @@ public class Fetcher extends Configured 
 } else {
   CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
   datum.getFetchInterval());
+  // transfer existing metadata 
+  newDatum.getMetaData().putAll(datum.getMetaData());
+  try {
+scfilters.initialScore(url, newDatum);
+  } catch (ScoringFilterException e) {
+e.printStackTrace();
+  }
   if (reprUrl != null) {
 newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
 new Text(reprUrl));




svn commit: r931098 - in /lucene/nutch/trunk: ./ conf/ lib/ src/plugin/ src/plugin/parse-tika/ src/plugin/parse-tika/lib/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/

2010-04-06 Thread jnioche
Author: jnioche
Date: Tue Apr  6 11:38:26 2010
New Revision: 931098

URL: http://svn.apache.org/viewvc?rev=931098&view=rev
Log:
NUTCH-810 Upgraded to Tika 0.7

Added:
lucene/nutch/trunk/lib/tika-core-0.7.jar   (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar   (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar   (with 
props)
Removed:
lucene/nutch/trunk/lib/tika-core-0.6.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/tika-mimetypes.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml

lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java

lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Apr  6 11:38:26 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-810 Upgrade to Tika 0.7 (jnioche)
+
 * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche)
 
 * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)

Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (original)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr  6 11:38:26 2010
@@ -2198,7 +2198,11 @@
 
   
 
-  
+  
+  
+  
+  
+  
 
 
   
@@ -3551,7 +3555,13 @@
   bad HTML, unfortunately.
  -->
 
+
 
+
+
+
+
+
 
   
   

Added: lucene/nutch/trunk/lib/tika-core-0.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr  6 11:38:26 2010
@@ -32,8 +32,8 @@
  
  
  
-
-
+ 
+ 
  
  
  
@@ -65,12 +65,12 @@
  
  
  
-
+ 
  
  
  
  
-
+ 
  
  
  
@@ -99,7 +99,6 @@
  
  
  
- 
  
  
  
@@ -107,7 +106,6 @@
  
  
  
- 
  
  
  
@@ -172,11 +170,11 @@
 
 
 
-   
+
 
 
 
-   
+
 
 
 

Modified: lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Apr  6 11:38:26 2010
@@ -1,7 +1,7 @@
 
 
 
-   
+   




Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar?rev=931098&view=auto
==
Binary file - no diff available.

Propchange: luc