svn commit: r934694 - in /lucene/nutch/trunk/src: java/org/apache/nutch/indexer/solr/ java/org/apache/nutch/searcher/ java/org/apache/nutch/tools/ plugin/parse-tika/src/java/org/apache/nutch/parse/tik

2010-04-15 Thread siren
Author: siren
Date: Fri Apr 16 05:42:28 2010
New Revision: 934694

URL: http://svn.apache.org/viewvc?rev=934694&view=rev
Log:
add missing license headers

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java

lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/QueryParamsTest.java
lucene/nutch/trunk/src/test/org/apache/nutch/util/TestEncodingDetector.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=934694&r1=934693&r2=934694&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
 Fri Apr 16 05:42:28 2010
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.indexer.solr;
 
 import java.io.DataInput;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java?rev=934694&r1=934693&r2=934694&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java Fri 
Apr 16 05:42:28 2010
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.searcher;
 
 import java.io.DataInput;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=934694&r1=934693&r2=934694&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Fri 
Apr 16 05:42:28 2010
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.tools;
 
 import java.io.IO

svn commit: r910173 - /lucene/nutch/trunk/src/web/jsp/search.jsp

2010-02-15 Thread siren
Author: siren
Date: Mon Feb 15 08:09:53 2010
New Revision: 910173

URL: http://svn.apache.org/viewvc?rev=910173&view=rev
Log:
NUTCH-793 search.jsp compile errors

Modified:
lucene/nutch/trunk/src/web/jsp/search.jsp

Modified: lucene/nutch/trunk/src/web/jsp/search.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/search.jsp?rev=910173&r1=910172&r2=910173&view=diff
==
--- lucene/nutch/trunk/src/web/jsp/search.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/search.jsp Mon Feb 15 08:09:53 2010
@@ -204,7 +204,7 @@
 // position this is good, bad?... ugly?
Hits hits;
try{
-  query.getParams.initFrom(start + hitsToRetrieve, hitsPerSite, "site", 
sort, reverse);
+  query.getParams().initFrom(start + hitsToRetrieve, hitsPerSite, "site", 
sort, reverse);
  hits = bean.search(query);
} catch (IOException e){
  hits = new Hits(0,new Hit[0]);




svn commit: r910044 - in /lucene/nutch/trunk: conf/nutch-default.xml default.properties

2010-02-14 Thread siren
Author: siren
Date: Sun Feb 14 17:13:29 2010
New Revision: 910044

URL: http://svn.apache.org/viewvc?rev=910044&view=rev
Log:
NUTCH-792 update version

Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/default.properties

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=910044&r1=910043&r2=910044&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sun Feb 14 17:13:29 2010
@@ -113,7 +113,7 @@
 
 
   http.agent.version
-  Nutch-1.0
+  Nutch-1.1-dev
   A version string to advertise in the User-Agent 
header.
 

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=910044&r1=910043&r2=910044&view=diff
==
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Sun Feb 14 17:13:29 2010
@@ -1,6 +1,6 @@
 Name=Nutch
 name=nutch
-version=1.0
+version=1.1-dev
 final.name=${name}-${version}
 year=2006
 




svn commit: r910041 - in /lucene/nutch/trunk: CHANGES.txt default.properties

2010-02-14 Thread siren
Author: siren
Date: Sun Feb 14 17:02:55 2010
New Revision: 910041

URL: http://svn.apache.org/viewvc?rev=910041&view=rev
Log:
NUTCH-790 Some external javadoc links are broken

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/default.properties

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=910041&r1=910040&r2=910041&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Feb 14 17:02:55 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-790 Some external javadoc links are broken (siren)
+
 * NUTCH-766 Tika parser (jnioche via mattmann)
 
 * NUTCH-786 Improvement to the list of suffix domains (jnioche)

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=910041&r1=910040&r2=910041&view=diff
==
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Sun Feb 14 17:02:55 2010
@@ -35,9 +35,9 @@
 # Proxy Host and Port to use for building JavaDoc
 javadoc.proxy.host=-J-DproxyHost=
 javadoc.proxy.port=-J-DproxyPort=
-javadoc.link.java=http://java.sun.com/j2se/1.4.2/docs/api/
-javadoc.link.lucene=http://jakarta.apache.org/lucene/docs/api/
-javadoc.link.hadoop=http://lucene.apache.org/hadoop/docs/api/
+javadoc.link.java=http://java.sun.com/javase/6/docs/api/
+javadoc.link.lucene=http://lucene.apache.org/java/2_9_1/api/all
+javadoc.link.hadoop=http://hadoop.apache.org/common/docs/r0.20.1/api/
 javadoc.packages=org.apache.nutch.*
 
 dist.dir=${build.dir}/${final.name}




svn commit: r905410 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/searcher/ src/test/org/apache/nutch/searcher/ src/web/jsp/

2010-02-01 Thread siren
Author: siren
Date: Mon Feb  1 20:47:34 2010
New Revision: 905410

URL: http://svn.apache.org/viewvc?rev=905410&view=rev
Log:
NUTCH-775 Enhance Searcher interface

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/QueryParamsTest.java
Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java
lucene/nutch/trunk/src/web/jsp/search.jsp

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905410&r1=905409&r2=905410&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Feb  1 20:47:34 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-775 Enhance searcher interface (siren)
+
 * NUTCH-781 Update Tika to v0.6 (jnioche)
 
 * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count 
(stack + jnioche)

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java?rev=905410&r1=905409&r2=905410&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java
 Mon Feb  1 20:47:34 2010
@@ -49,10 +49,6 @@
 private int id;
 
 private Query query;
-private int numHits;
-private String dedupField;
-private String sortField;
-private boolean reverse;
 
 public SearchTask(int id) {
   this.id = id;
@@ -62,16 +58,20 @@
   if (!liveServers[id]) {
 return null;
   }
-  return beans[id].search(query, numHits, dedupField, sortField, reverse);
+  return beans[id].search(query);
 }
 
+/**
+ * @deprecated since 1.1, use {@link #setSearchArgs(Query)} instead
+ */
 public void setSearchArgs(Query query, int numHits, String dedupField,
   String sortField, boolean reverse) {
   this.query = query;
-  this.numHits = numHits;
-  this.dedupField = dedupField;
-  this.sortField = sortField;
-  this.reverse = reverse;
+  query.setParams(new QueryParams(numHits, 
QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse));
+}
+
+private void setSearchArgs(Query query) {
+  this.query = query;
 }
 
   }
@@ -199,12 +199,10 @@
 return beans[hit.getIndexNo()].getExplanation(query, hit);
   }
 
-  public Hits search(Query query, int numHits, String dedupField,
- String sortField, boolean reverse) throws IOException {
-
+  @Override
+  public Hits search(Query query) throws IOException {
 for (Callable task : searchTasks) {
-  ((SearchTask)task).setSearchArgs(query, numHits, dedupField, sortField,
-  reverse);
+  ((SearchTask)task).setSearchArgs(query);
 }
 
 List> allHits;
@@ -216,10 +214,12 @@
 }
 
 PriorityQueue queue;// cull top hits from results
-if (sortField == null || reverse) {
-  queue = new PriorityQueue(numHits);
+if (query.getParams().getSortField() == null
+|| query.getParams().isReverse()) {
+  queue = new PriorityQueue(query.getParams().getNumHits());
 } else {
-  queue = new PriorityQueue(numHits, new Comparator() {
+  queue = new PriorityQueue(query.getParams().getNumHits(),
+  new Comparator() {
 public int compare(Hit h1, Hit h2) {
   return h2.compareTo(h1); // reverse natural order
 }
@@ -251,7 +251,8 @@
 Hit newHit = new Hit(i, hit.getUniqueKey(),
  hit.getSortValue(), hit.getDedupValue());
 queue.add(newHit);
-if (queue.size() > numHits) { // if hit queue overfull
+if (queue.size() > query.getParams().getNumHits()) {
+  // if hit queue overfull
   queue.remove();
 }
   }
@@ -265,6 +266,15 @@
 return new Hits(totalHits, culledResults);
   }
 
+  @Override
+  @Deprecated
+  public Hits search(Query query, int numHits, String dedupField,
+ String sortField, boolean reverse) throws IOException {
+
+qu

svn commit: r759345 [2/2] - in /lucene/nutch/trunk: site/ src/site/src/documentation/content/xdocs/

2009-03-27 Thread siren
Modified: lucene/nutch/trunk/site/tutorial.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial.pdf?rev=759345&r1=759344&r2=759345&view=diff
==
--- lucene/nutch/trunk/site/tutorial.pdf (original)
+++ lucene/nutch/trunk/site/tutorial.pdf Fri Mar 27 20:50:56 2009
@@ -419,8 +419,8 @@
 62 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F3
-/BaseFont /Helvetica-Bold
+/Name /F1
+/BaseFont /Helvetica
 /Encoding /WinAnsiEncoding >>
 endobj
 63 0 obj
@@ -433,15 +433,15 @@
 64 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F6
-/BaseFont /Times-Italic
+/Name /F3
+/BaseFont /Helvetica-Bold
 /Encoding /WinAnsiEncoding >>
 endobj
 65 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F1
-/BaseFont /Helvetica
+/Name /F2
+/BaseFont /Helvetica-Oblique
 /Encoding /WinAnsiEncoding >>
 endobj
 66 0 obj
@@ -454,8 +454,8 @@
 67 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F2
-/BaseFont /Helvetica-Oblique
+/Name /F6
+/BaseFont /Times-Italic
 /Encoding /WinAnsiEncoding >>
 endobj
 68 0 obj
@@ -479,7 +479,7 @@
 endobj
 3 0 obj
 << 
-/Font << /F3 62 0 R /F5 63 0 R /F1 65 0 R /F6 64 0 R /F9 66 0 R /F2 67 0 R /F7 
68 0 R >> 
+/Font << /F1 62 0 R /F5 63 0 R /F3 64 0 R /F2 65 0 R /F9 66 0 R /F6 67 0 R /F7 
68 0 R >> 
 /ProcSet [ /PDF /ImageC /Text ] >> 
 endobj
 9 0 obj
@@ -618,11 +618,11 @@
 016361 0 n 
 016588 0 n 
 016744 0 n 
-016857 0 n 
-016967 0 n 
-017078 0 n 
-017186 0 n 
-017292 0 n 
+016852 0 n 
+016962 0 n 
+017075 0 n 
+017191 0 n 
+017297 0 n 
 017408 0 n 
 trailer
 <<

Modified: lucene/nutch/trunk/site/tutorial8.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial8.html?rev=759345&r1=759344&r2=759345&view=diff
==
--- lucene/nutch/trunk/site/tutorial8.html (original)
+++ lucene/nutch/trunk/site/tutorial8.html Fri Mar 27 20:50:56 2009
@@ -138,13 +138,16 @@
 i18n
 
 
-API Docs (0.7.2)
+API Docs (1.0)
+
+
+API Docs (0.9)
 
 
 API Docs (0.8.x)
 
 
-API Docs (0.9)
+API Docs (0.7.2)
 
 
 http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)

Modified: lucene/nutch/trunk/site/tutorial8.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial8.pdf?rev=759345&r1=759344&r2=759345&view=diff
==
--- lucene/nutch/trunk/site/tutorial8.pdf (original)
+++ lucene/nutch/trunk/site/tutorial8.pdf Fri Mar 27 20:50:56 2009
@@ -464,8 +464,8 @@
 68 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F3
-/BaseFont /Helvetica-Bold
+/Name /F1
+/BaseFont /Helvetica
 /Encoding /WinAnsiEncoding >>
 endobj
 69 0 obj
@@ -478,15 +478,15 @@
 70 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F6
-/BaseFont /Times-Italic
+/Name /F3
+/BaseFont /Helvetica-Bold
 /Encoding /WinAnsiEncoding >>
 endobj
 71 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F1
-/BaseFont /Helvetica
+/Name /F2
+/BaseFont /Helvetica-Oblique
 /Encoding /WinAnsiEncoding >>
 endobj
 72 0 obj
@@ -499,8 +499,8 @@
 73 0 obj
 << /Type /Font
 /Subtype /Type1
-/Name /F2
-/BaseFont /Helvetica-Oblique
+/Name /F6
+/BaseFont /Times-Italic
 /Encoding /WinAnsiEncoding >>
 endobj
 74 0 obj
@@ -524,7 +524,7 @@
 endobj
 3 0 obj
 << 
-/Font << /F3 68 0 R /F5 69 0 R /F1 71 0 R /F6 70 0 R /F9 72 0 R /F2 73 0 R /F7 
74 0 R >> 
+/Font << /F1 68 0 R /F5 69 0 R /F3 70 0 R /F2 71 0 R /F9 72 0 R /F6 73 0 R /F7 
74 0 R >> 
 /ProcSet [ /PDF /ImageC /Text ] >> 
 endobj
 9 0 obj
@@ -669,11 +669,11 @@
 021535 0 n 
 021762 0 n 
 021918 0 n 
-022031 0 n 
-022141 0 n 
-022252 0 n 
-022360 0 n 
-022466 0 n 
+022026 0 n 
+022136 0 n 
+022249 0 n 
+022365 0 n 
+022471 0 n 
 022582 0 n 
 trailer
 <<

Modified: lucene/nutch/trunk/site/version_control.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/version_control.html?rev=759345&r1=759344&r2=759345&view=diff
==
--- lucene/nutch/trunk/site/version_control.html (original)
+++ lucene/nutch/trunk/site/version_control.html Fri Mar 27 20:50:56 2009
@@ -138,13 +138,16 @@
 i18n
 
 
-API Docs (0.7.2)
+API Docs (1.0)
+
+
+API Docs (0.9)
 
 
 API Docs (0.8.x)
 
 
-API Docs (0.9)
+API Docs (0.7.2)
 
 
 http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)

Modified: lucene/nutch/trunk/site/version_control.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/version_control.pdf?rev=759345&r1=759344&r2=759345&view=diff
==
--- lucene/nutch/trunk/site/version_control.pdf (original)
+++ lucene/nutch/trunk/

svn commit: r759328 - /lucene/nutch/tags/release-1.0/

2009-03-27 Thread siren
Author: siren
Date: Fri Mar 27 20:10:11 2009
New Revision: 759328

URL: http://svn.apache.org/viewvc?rev=759328&view=rev
Log:
Nutch 1.0 release.

Added:
lucene/nutch/tags/release-1.0/
  - copied from r759327, lucene/nutch/tags/release-1.0-rc2/



svn commit: r757511 - /lucene/nutch/tags/release-1.0-rc2/

2009-03-23 Thread siren
Author: siren
Date: Mon Mar 23 19:18:47 2009
New Revision: 757511

URL: http://svn.apache.org/viewvc?rev=757511&view=rev
Log:
Nutch 1.0 rc2

Added:
lucene/nutch/tags/release-1.0-rc2/
  - copied from r757510, lucene/nutch/trunk/



svn commit: r757500 - /lucene/nutch/trunk/CHANGES.txt

2009-03-23 Thread siren
Author: siren
Date: Mon Mar 23 18:59:26 2009
New Revision: 757500

URL: http://svn.apache.org/viewvc?rev=757500&view=rev
Log:
update release date

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=757500&r1=757499&r2=757500&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 23 18:59:26 2009
@@ -1,6 +1,6 @@
 Nutch Change Log
 
-Release 1.0 - 2009-03-10
+Release 1.0 - 2009-03-23
 
  1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
 




svn commit: r757327 - in /lucene/nutch/trunk: CHANGES.txt README.txt src/plugin/parse-pdf/lib/jai_codec.jar src/plugin/parse-pdf/lib/jai_core.jar src/plugin/parse-pdf/plugin.xml

2009-03-22 Thread siren
Author: siren
Date: Mon Mar 23 06:41:13 2009
New Revision: 757327

URL: http://svn.apache.org/viewvc?rev=757327&view=rev
Log:
NUTCH-722 remove JAI libs

Removed:
lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_codec.jar
lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_core.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/README.txt
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=757327&r1=757326&r2=757327&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 23 06:41:13 2009
@@ -380,6 +380,8 @@
 
 143. NUTCH-715 - Subcollection plugin doesn't work with default
  subcollections.xml file (Dmitry Lihachev via siren)
+ 
+144. NUTCH-722 - Nutch contains JAI jars that we cannot redistribute
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/README.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/README.txt?rev=757327&r1=757326&r2=757327&view=diff
==
--- lucene/nutch/trunk/README.txt (original)
+++ lucene/nutch/trunk/README.txt Mon Mar 23 06:41:13 2009
@@ -1,5 +1,19 @@
 Apache Nutch README
 
+Important note: Due to licensing issues we cannot provide two libraries that
+are normally provided with PDFBox (jai_core.jar, jai_codec.jar), the parser
+library we use for parsing PDF files. If you encounter unexpected problems when
+working with PDF files please
+
+1. download the two missing libraries  from:
+   http://pdfbox.cvs.sourceforge.net/viewvc/pdfbox/pdfbox/external/
+
+2. Put them to directory src/plugin/parse-pdf/lib
+3. follow the instructions in file src/plugin/parse-pdf/plugin.xml
+4. Rebuild nutch.
+
+
+
 Interesting files include:
 
 

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml?rev=757327&r1=757326&r2=757327&view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Mon Mar 23 06:41:13 2009
@@ -30,8 +30,12 @@
   
   
   
+  
+  

 





svn commit: r756218 - /lucene/nutch/trunk/build.xml

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 21:34:47 2009
New Revision: 756218

URL: http://svn.apache.org/viewvc?rev=756218&view=rev
Log:
NUTCH-727

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=756218&r1=756217&r2=756218&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Thu Mar 19 21:34:47 2009
@@ -575,6 +575,7 @@
 
   
 
+
   
 
 




svn commit: r756210 - /lucene/nutch/trunk/KEYS

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 21:26:52 2009
New Revision: 756210

URL: http://svn.apache.org/viewvc?rev=756210&view=rev
Log:
copy keys to trunk

Added:
lucene/nutch/trunk/KEYS
  - copied unchanged from r756209, lucene/nutch/dist/KEYS



svn commit: r756199 - /lucene/nutch/trunk/NOTICE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 21:10:28 2009
New Revision: 756199

URL: http://svn.apache.org/viewvc?rev=756199&view=rev
Log:
NUTCH-725

Modified:
lucene/nutch/trunk/NOTICE.txt

Modified: lucene/nutch/trunk/NOTICE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/NOTICE.txt?rev=756199&r1=756198&r2=756199&view=diff
==
--- lucene/nutch/trunk/NOTICE.txt (original)
+++ lucene/nutch/trunk/NOTICE.txt Thu Mar 19 21:10:28 2009
@@ -71,4 +71,5 @@
 Nutch includes Automaton:
 This package is Copyright © 2001-2008 Anders Møller. All rights reserved.
 
- 
+Nutch includes Rome:
+Copyright 2004 Sun Microsystems, Inc. 




svn commit: r756198 [1/2] - /lucene/nutch/trunk/LICENSE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 21:09:56 2009
New Revision: 756198

URL: http://svn.apache.org/viewvc?rev=756198&view=rev
Log:
NUTCH-723

Modified:
lucene/nutch/trunk/LICENSE.txt



svn commit: r756192 - in /lucene/nutch/trunk/src/plugin/response-json/lib: ezmorph-1.0.6.LICENSE.txt json-lib-2.2.2-jdk15.LICENSE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 20:53:42 2009
New Revision: 756192

URL: http://svn.apache.org/viewvc?rev=756192&view=rev
Log:
record licenses

Added:
lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt

lucene/nutch/trunk/src/plugin/response-json/lib/json-lib-2.2.2-jdk15.LICENSE.txt

Added: lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt?rev=756192&view=auto
==
--- lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt 
(added)
+++ lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt 
Thu Mar 19 20:53:42 2009
@@ -0,0 +1,201 @@
+  Apache License
+   Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  copyright license to reproduce, prepare Derivative Works of,
+  publicly display, publicly perform, sublicense, and distribute the
+  Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the term

svn commit: r756182 - /lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 20:41:19 2009
New Revision: 756182

URL: http://svn.apache.org/viewvc?rev=756182&view=rev
Log:
record license

Added:
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt?rev=756182&view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt 
Thu Mar 19 20:41:19 2009
@@ -0,0 +1,24 @@
+Copyright (c) 2001-2004 Anders Moeller
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file




svn commit: r756181 - /lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 20:40:19 2009
New Revision: 756181

URL: http://svn.apache.org/viewvc?rev=756181&view=rev
Log:
record license

Added:
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt?rev=756181&view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt 
Thu Mar 19 20:40:19 2009
@@ -0,0 +1,17 @@
+dk.brics.automaton
+--
+
+Copyright (C) 2001-2004 Anders Moeller
+
+This source code in this package may be used under the terms of the
+BSD license.  Please read the file 'COPYING' for details.
+
+This package contains a full DFA/NFA implementation with Unicode
+alphabet and support for all standard regular expression operations.
+
+For more information, go to the package home page at
+http://www.brics.dk/~amoeller/automaton/
+
+
+Anders Moeller
+amoel...@brics.dk
\ No newline at end of file




svn commit: r756174 - /lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 20:32:15 2009
New Revision: 756174

URL: http://svn.apache.org/viewvc?rev=756174&view=rev
Log:
record license

Added:
lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt

Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt?rev=756174&view=auto
==
--- lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt (added)
+++ lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt Thu Mar 19 
20:32:15 2009
@@ -0,0 +1,14 @@
+Copyright 2004 Sun Microsystems, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+




svn commit: r756171 - /lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 20:27:58 2009
New Revision: 756171

URL: http://svn.apache.org/viewvc?rev=756171&view=rev
Log:
record license

Added:
lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt

Added: lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt?rev=756171&view=auto
==
--- lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt (added)
+++ lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt Thu Mar 19 20:27:58 2009
@@ -0,0 +1,202 @@
+
+ Apache License
+   Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  copyright license to reproduce, prepare Derivative Works of,
+  publicly display, publicly perform, sublicense, and distribute the
+  Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  (except as stated in this section) pa

svn commit: r756154 - /lucene/nutch/trunk/NOTICE.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 19:48:03 2009
New Revision: 756154

URL: http://svn.apache.org/viewvc?rev=756154&view=rev
Log:
NUTCH-725

Modified:
lucene/nutch/trunk/NOTICE.txt

Modified: lucene/nutch/trunk/NOTICE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/NOTICE.txt?rev=756154&r1=756153&r2=756154&view=diff
==
--- lucene/nutch/trunk/NOTICE.txt (original)
+++ lucene/nutch/trunk/NOTICE.txt Thu Mar 19 19:48:03 2009
@@ -1,2 +1,74 @@
+Apache Nutch
+Copyright 2009 The Apache Software Foundation
+
 This product includes software developed by The Apache Software
 Foundation (http://www.apache.org/).
+
+This product includes software developed by the following copyright owners:
+
+Nutch includes icu4j:
+Copyright (c) 1995-2006 International Business Machines Corporation and
+others
+
+Nutch includes Carrot2:
+Copyright (C) 2002-2006, Dawid Weiss, Stanisław Osiński.
+Dawid Weiss; Project administrator, various components, core; 2002; Poland
+Stanisław, Osiński; Lingo clustering component, ODP Input; 2003; Poland
+Karol Gołembniak, Irmina Masłowska; HAOG clustering component; 2006; Poznan 
University of Technology; Poland
+Michał, Wróblewski [*]; AHC clustering components; 2003; Poznan University 
of Technology, Poland
+Paweł, Kowalik [*]; Inductive search engine wrapper; 2003; Poznan University 
of Technology, Poland
+Steven, Schockaert [*]; Fuzzy Ants clustering component; 2004; University of 
Gent, Belgium
+Lang, Ngo Chi [*]; Fuzzy Rough set clustering component; 2004; Warsaw 
University, Poland
+
+Nutch includes Saxpath:
+Copyright (C) 2000-2002 werken digital. All rights reserved.
+
+Nutch includes jaxen:
+Copyright 2003-2006 The Werken Company. All Rights Reserved.
+
+Nutch includes Jdom:
+Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin.
+All rights reserved
+
+Nutch includes SaxPath:
+Copyright (C) 2000-2002 werken digital. All rights reserved.
+ 
+Nutch includes Snowball:
+Copyright (c) 2001, Dr Martin Porter
+(for the Java developments) Copyright (c) 2002, Richard Boulton. 
+
+Nutch includes ViolinStrings:
+Copyright (c) Michael Schmeling 1998, 2000 - All Rights Reserve
+
+Nutch includes Cyperneko:
+(C) Copyright 2002,2003, Andy Clark.  All rights reserved.
+
+Nutch includes Jena:
+(c) Copyright 2000, 2001, 2002, 2003, 2004 Hewlett-Packard Development 
Company, LP
+All rights reserved.
+
+Nutch includes BouncyCastle:
+Copyright (c) 2000 - 2008 The Legion Of The Bouncy Castle 
(http://www.bouncycastle.org)
+
+Nutch includes FontBox:
+Copyright (c) 2003-2005, www.fontbox.org
+
+Nutch includes JempBox:
+Copyright (c) 2006-2007, www.jempbox.org
+All rights reserved.
+
+Nutch includes PDFBox:
+Copyright (c) 2003-2005, www.pdfbox.org
+All rights reserved.
+
+Nutch includes JavaSWF:
+Copyright (c) 2001-2005, David N. Main, All rights reserved.
+
+Nutch includes Json Lib:
+This product includes software developed by Douglas Crockford 
+(http://www.crockford.com).
+
+Nutch includes Automaton:
+This package is Copyright © 2001-2008 Anders Møller. All rights reserved.
+
+ 




svn commit: r756149 - in /lucene/nutch/trunk/src/plugin/lib-xml/lib: jaxen.LICENSE jdom.LICENSE saxpath.LICENSE

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 19:33:11 2009
New Revision: 756149

URL: http://svn.apache.org/viewvc?rev=756149&view=rev
Log:
record licenses

Added:
lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE
lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE
lucene/nutch/trunk/src/plugin/lib-xml/lib/saxpath.LICENSE

Added: lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE?rev=756149&view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE (added)
+++ lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE Thu Mar 19 19:33:11 
2009
@@ -0,0 +1,33 @@
+/*
+ $Id: LICENSE.txt 1128 2006-02-05 21:49:04Z elharo $
+
+ Copyright 2003-2006 The Werken Company. All Rights Reserved.
+ 
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+  * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+  * Neither the name of the Jaxen Project nor the names of its
+contributors may be used to endorse or promote products derived 
+from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */

Added: lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE?rev=756149&view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE (added)
+++ lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE Thu Mar 19 19:33:11 
2009
@@ -0,0 +1,55 @@
+/*-- 
+
+ $Id: LICENSE.txt,v 1.11 2004/02/06 09:32:57 jhunter Exp $
+
+ Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin.
+ All rights reserved.
+ 
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 
+ 1. Redistributions of source code must retain the above copyright
+notice, this list of conditions, and the following disclaimer.
+ 
+ 2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions, and the disclaimer that follows 
+these conditions in the documentation and/or other materials 
+provided with the distribution.
+
+ 3. The name "JDOM" must not be used to endorse or promote products
+derived from this software without prior written permission.  For
+written permission, please contact <request_AT_jdom_DOT_org>.
+ 
+ 4. Products derived from this software may not be called "JDOM", nor
+may "JDOM" appear in their name, without prior written permission
+from the JDOM Project Management <request_AT_jdom_DOT_org>.
+ 
+ In addition, we request (but do not require) that you include in the 
+ end-user documentation provided with the redistribution and/or in the 
+ software itself an acknowledgement equivalent to the following:
+ "This product includes software developed by the
+  JDOM Project (http://www.jdom.org/)."
+ Alternatively, the acknowledgment may be graphical using the logos 
+ available at http://www.jdom.org/images/logos.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED.  IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE

svn commit: r755994 - /lucene/nutch/trunk/README.txt

2009-03-19 Thread siren
Author: siren
Date: Thu Mar 19 13:47:32 2009
New Revision: 755994

URL: http://svn.apache.org/viewvc?rev=755994&view=rev
Log:
NUTCH-726

Modified:
lucene/nutch/trunk/README.txt

Modified: lucene/nutch/trunk/README.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/README.txt?rev=755994&r1=755993&r2=755994&view=diff
==
--- lucene/nutch/trunk/README.txt (original)
+++ lucene/nutch/trunk/README.txt Thu Mar 19 13:47:32 2009
@@ -1,4 +1,4 @@
-Nutch README
+Apache Nutch README
 
 Interesting files include:
 




svn commit: r752004 - /lucene/nutch/tags/release-1.0-rc1/

2009-03-10 Thread siren
Author: siren
Date: Tue Mar 10 07:15:13 2009
New Revision: 752004

URL: http://svn.apache.org/viewvc?rev=752004&view=rev
Log:
Nutch 1.0 rc1

Added:
lucene/nutch/tags/release-1.0-rc1/
  - copied from r752003, lucene/nutch/trunk/



svn commit: r752001 - /lucene/nutch/trunk/CHANGES.txt

2009-03-10 Thread siren
Author: siren
Date: Tue Mar 10 07:08:29 2009
New Revision: 752001

URL: http://svn.apache.org/viewvc?rev=752001&view=rev
Log:
prepare for release

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=752001&r1=752000&r2=752001&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 10 07:08:29 2009
@@ -1,6 +1,6 @@
 Nutch Change Log
 
-Release 1.0 - 2009-03-08
+Release 1.0 - 2009-03-10
 
  1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
 




svn commit: r752000 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/DomUtil.java src/plugin/build.xml src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollecti

2009-03-10 Thread siren
Author: siren
Date: Tue Mar 10 07:07:22 2009
New Revision: 752000

URL: http://svn.apache.org/viewvc?rev=752000&view=rev
Log:
NUTCH-715 - Subcollection plugin doesn't work with default subcollections.xml 
file. Contributed by Dmitry Lihachev

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
lucene/nutch/trunk/src/plugin/build.xml

lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=752000&r1=751999&r2=752000&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 10 07:07:22 2009
@@ -378,6 +378,9 @@
 
 142. NUTCH-684 - Dedup support for Solr. (dogacan)
 
+143. NUTCH-715 - Subcollection plugin doesn't work with default
+ subcollections.xml file (Dmitry Lihachev via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java?rev=752000&r1=751999&r2=752000&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java Tue Mar 10 
07:07:22 2009
@@ -60,7 +60,11 @@
   input = new InputSource(is);
   input.setEncoding("UTF-8");
   parser.parse(input);
-  element = (Element) parser.getDocument().getChildNodes().item(0);
+  int i = 0;
+  while (! (parser.getDocument().getChildNodes().item(i) instanceof 
Element)) {
+   i++;
+  } 
+  element = (Element)parser.getDocument().getChildNodes().item(i);
 } catch (FileNotFoundException e) {
   e.printStackTrace(LogUtil.getWarnStream(LOG));
 } catch (SAXException e) {

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=752000&r1=751999&r2=752000&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Mar 10 07:07:22 2009
@@ -112,6 +112,7 @@
  
  
  
+ 
  
  
  

Modified: 
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=752000&r1=751999&r2=752000&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
 Tue Mar 10 07:07:22 2009
@@ -49,6 +49,7 @@
   public void testInput(){
 StringBuffer xml=new StringBuffer();
 xml.append("");
+xml.append("");
 xml.append("");
 xml.append("");
 xml.append("nutch collection");




svn commit: r751480 - /lucene/nutch/tags/release-1.0-rc0/

2009-03-08 Thread siren
Author: siren
Date: Sun Mar  8 17:38:39 2009
New Revision: 751480

URL: http://svn.apache.org/viewvc?rev=751480&view=rev
Log:
Nutch 1.0 rc0

Added:
lucene/nutch/tags/release-1.0-rc0/
  - copied from r751479, lucene/nutch/trunk/



svn commit: r751475 - /lucene/nutch/trunk/CHANGES.txt

2009-03-08 Thread siren
Author: siren
Date: Sun Mar  8 17:30:52 2009
New Revision: 751475

URL: http://svn.apache.org/viewvc?rev=751475&view=rev
Log:
the version is indeed 1.0

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=751475&r1=751474&r2=751475&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar  8 17:30:52 2009
@@ -1,6 +1,6 @@
 Nutch Change Log
 
-Release 0.9 - 2009-03-08
+Release 1.0 - 2009-03-08
 
  1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
 




svn commit: r751471 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml default.properties

2009-03-08 Thread siren
Author: siren
Date: Sun Mar  8 17:20:59 2009
New Revision: 751471

URL: http://svn.apache.org/viewvc?rev=751471&view=rev
Log:
preparing for release

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/default.properties

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=751471&r1=751470&r2=751471&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar  8 17:20:59 2009
@@ -1,6 +1,6 @@
 Nutch Change Log
 
-Unreleased changes (1.0-dev)
+Release 0.9 - 2009-03-08
 
  1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
 

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=751471&r1=751470&r2=751471&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sun Mar  8 17:20:59 2009
@@ -113,7 +113,7 @@
 
 
   http.agent.version
-  Nutch-1.0-dev
+  Nutch-1.0
   A version string to advertise in the User-Agent 
header.
 

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=751471&r1=751470&r2=751471&view=diff
==
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Sun Mar  8 17:20:59 2009
@@ -1,6 +1,6 @@
 Name=Nutch
 name=nutch
-version=1.0-dev
+version=1.0
 final.name=${name}-${version}
 year=2006
 




svn commit: r749289 - in /lucene/nutch/trunk: ./ bin/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/test/org/apache/nutch/fetcher/

2009-03-02 Thread siren
Author: siren
Date: Mon Mar  2 12:28:22 2009
New Revision: 749289

URL: http://svn.apache.org/viewvc?rev=749289&view=rev
Log:
NUTCH-669 - Consolidate code for Fetcher and Fetcher2

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
  - copied, changed from r747319, 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Removed:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749289&r1=749288&r2=749289&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  2 12:28:22 2009
@@ -372,6 +372,8 @@
  
 139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren)
 
+140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren)
+
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?rev=749289&r1=749288&r2=749289&view=diff
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Mon Mar  2 12:28:22 2009
@@ -41,7 +41,6 @@
   echo "  generate  generate new segments to fetch from crawl db"
   echo "  freegen   generate new segments to fetch from text files"
   echo "  fetch fetch a segment's pages"
-  echo "  fetch2fetch a segment's pages using Fetcher2 
implementation"
   echo "  parse parse a segment's pages"
   echo "  readseg   read / dump segment data"
   echo "  mergesegs merge several segments, with optional filtering 
and slicing"

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=749289&r1=749288&r2=749289&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar  2 
12:28:22 2009
@@ -24,7 +24,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-import org.apache.nutch.fetcher.Fetcher;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
@@ -36,6 +35,8 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
+import org.apache.nutch.fetcher.Fetcher;
+
 public class Crawl {
   public static final Log LOG = LogFactory.getLog(Crawl.class);
 
@@ -118,7 +119,7 @@
 LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
 break;
   }
-  fetcher.fetch(segment, threads);  // fetch it
+  fetcher.fetch(segment, threads, 
org.apache.nutch.fetcher.Fetcher.isParsing(conf));  // fetch it
   if (!Fetcher.isParsing(job)) {
 parseSegment.parse(segment);// parse it, if needed
   }

Copied: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (from 
r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java)
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?p2=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&p1=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java&r1=747319&r2=749289&rev=749289&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar  
2 12:28:22 2009
@@ -1,9 +1,10 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE

svn commit: r749256 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar src/plugin/lib-nekohtml/lib/nekohtml-1.9.11.jar src/plugin/lib-nekohtml/plugin.xml

2009-03-02 Thread siren
Author: siren
Date: Mon Mar  2 10:16:51 2009
New Revision: 749256

URL: http://svn.apache.org/viewvc?rev=749256&view=rev
Log:
NUTCH-700 - revert to nekohtml-0.9.4

Added:
lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar   (with 
props)
Removed:
lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-1.9.11.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749256&r1=749255&r2=749256&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  2 10:16:51 2009
@@ -369,6 +369,8 @@
 
 138. NUTCH-419 - Unavailable robots.txt kills fetch (Carsten Lehmann,
  Doug Cook via ab)
+ 
+139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren)
 
 
 Release 0.9 - 2007-04-02

Added: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar?rev=749256&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml?rev=749256&r1=749255&r2=749256&view=diff
==
--- lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml Mon Mar  2 10:16:51 
2009
@@ -29,7 +29,7 @@
provider-name="org.cyberneko">
 

- 
+ 
 
  





svn commit: r748408 - in /lucene/nutch/trunk: CHANGES.txt conf/schema.xml

2009-02-26 Thread siren
Author: siren
Date: Fri Feb 27 06:21:37 2009
New Revision: 748408

URL: http://svn.apache.org/viewvc?rev=748408&view=rev
Log:
NUTCH-699 - Add an "official" solr schema for solr integration. Contributed by 
dogacan, Dmitry Lihachev

Added:
lucene/nutch/trunk/conf/schema.xml
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=748408&r1=748407&r2=748408&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Feb 27 06:21:37 2009
@@ -361,6 +361,9 @@
 
 135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan
  via siren)
+ 
+136. NUTCH-699 - Add an "official" solr schema for solr integration (dogacan,
+ Dmitry Lihachev via siren)
 
 Release 0.9 - 2007-04-02
 

Added: lucene/nutch/trunk/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/schema.xml?rev=748408&view=auto
==
--- lucene/nutch/trunk/conf/schema.xml (added)
+++ lucene/nutch/trunk/conf/schema.xml Fri Feb 27 06:21:37 2009
@@ -0,0 +1,109 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+id
+content
+
+
+
\ No newline at end of file




svn commit: r747324 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2009-02-24 Thread siren
Author: siren
Date: Tue Feb 24 10:09:36 2009
New Revision: 747324

URL: http://svn.apache.org/viewvc?rev=747324&view=rev
Log:
NUTCH-698 - CrawlDb is corrupted after a few crawl cycles, contributed by 
dogacan

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747324&r1=747323&r2=747324&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 10:09:36 2009
@@ -359,6 +359,9 @@
 
 134. NUTCH-247 - Robot parser to restrict (kubes, siren)
 
+135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan
+ via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=747324&r1=747323&r2=747324&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Feb 
24 10:09:36 2009
@@ -204,7 +204,17 @@
   }
   
public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
- this.metaData = mapWritable;
+ this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+   }
+   
+   /** Add all metadata from other CrawlDatum to this CrawlDatum.
+* 
+* @param other CrawlDatum
+*/
+   public void putAllMetaData(CrawlDatum other) {
+ for (Entry e : other.getMetaData().entrySet()) {
+   metaData.put(e.getKey(), e.getValue());
+ }
}
 
   /**

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=747324&r1=747323&r2=747324&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue 
Feb 24 10:09:36 2009
@@ -131,10 +131,10 @@
 if (oldSet) {
   // copy metadata from old, if exists
   if (old.getMetaData().size() > 0) {
-result.getMetaData().putAll(old.getMetaData());
+result.putAllMetaData(old);
 // overlay with new, if any
 if (fetch.getMetaData().size() > 0)
-  result.getMetaData().putAll(fetch.getMetaData());
+  result.putAllMetaData(fetch);
   }
   // set the most recent valid value of modifiedTime
   if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {




svn commit: r747319 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/test/ src/test/org/apache/nutch/fetcher/

2009-02-24 Thread siren
Author: siren
Date: Tue Feb 24 09:54:30 2009
New Revision: 747319

URL: http://svn.apache.org/viewvc?rev=747319&view=rev
Log:
NUTCH-247 - Robot parser to restrict, contributed by kubes

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/test/crawl-tests.xml
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747319&r1=747318&r2=747319&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:54:30 2009
@@ -357,6 +357,8 @@
 133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
  set at cross domain redirects (Remco Verhoef, dogacan via siren)
 
+134. NUTCH-247 - Robot parser to restrict (kubes, siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747319&r1=747318&r2=747319&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 
24 09:54:30 2009
@@ -933,6 +933,8 @@
   public void fetch(Path segment, int threads, boolean parsing)
 throws IOException {
 
+checkConfiguration();
+
 if (LOG.isInfoEnabled()) {
   LOG.info("Fetcher: starting");
   LOG.info("Fetcher: segment: " + segment);
@@ -995,4 +997,40 @@
 fetcher.fetch(segment, threads, parsing);  // run the Fetcher
 
   }
+
+  private void checkConfiguration() {
+
+// ensure that a value has been set for the agent name and that that
+// agent name is the first value in the agents we advertise for robot
+// rules parsing
+String agentName = getConf().get("http.agent.name");
+if (agentName == null || agentName.trim().length() == 0) {
+  String message = "Fetcher: No agents listed in 'http.agent.name'"
+  + " property.";
+  if (LOG.isFatalEnabled()) {
+LOG.fatal(message);
+  }
+  throw new IllegalArgumentException(message);
+} else {
+
+  // get all of the agents that we advertise
+  String agentNames = getConf().get("http.robots.agents");
+  StringTokenizer tok = new StringTokenizer(agentNames, ",");
+  ArrayList agents = new ArrayList();
+  while (tok.hasMoreTokens()) {
+agents.add(tok.nextToken().trim());
+  }
+
+  // if the first one is not equal to our agent name, log fatal and throw
+  // an exception
+  if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
+String message = "Fetcher: Your 'http.agent.name' value should be "
++ "listed first in 'http.robots.agents' property.";
+if (LOG.isWarnEnabled()) {
+  LOG.warn(message);
+}
+  }
+}
+  }
+
 }

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=747319&r1=747318&r2=747319&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 Tue Feb 24 09:54:30 2009
@@ -223,9 +223,6 @@
 // Grab the agent names we advertise to robots files.
 //
 String agentName = conf.get("http.agent.name");
-if (null == agentName) {
-  throw new RuntimeException("Agent name not configured!");
-}
 String agentNames = conf.get("http.robots.agents");
 StringTokenizer tok = new StringTokenizer(agentNames, ",");
 ArrayList agents = new ArrayList();
@@ -233,23 +230,6 @@
   agents.add(tok.nextToken().trim());
 }
 
-//
-// If there are no agents for robots-parsing, use our
-// default agent-string.  If both are present, our agent-string
-// should be the first one we advertise to robots-parsing.
-//
-if (agents.size() == 0) {
-  agents.add(agentName);
-  if (LOG.isFatalEnabled()) {
-LOG.fata

svn commit: r747312 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher2.java

2009-02-24 Thread siren
Author: siren
Date: Tue Feb 24 09:18:03 2009
New Revision: 747312

URL: http://svn.apache.org/viewvc?rev=747312&view=rev
Log:
NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links set at 
cross domain redirects, contributed by Remco Verhoef, dogacan

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747312&r1=747311&r2=747312&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:18:03 2009
@@ -349,11 +349,14 @@
 130. NUTCH-563 - Include custom fields in BasicQueryFilter
  (Julien Nioche via siren)
  
-131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+131. NUTCH-695 - Incorrect mime type detection by MoreIndexingFilter plugin
  (Dmitry Lihachev via siren)
  
 132. NUTCH-694 - Distributed Search Server fails (siren)
 
+133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
+ set at cross domain redirects (Remco Verhoef, dogacan via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747312&r1=747311&r2=747312&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 
24 09:18:03 2009
@@ -94,7 +94,6 @@
   throws IOException {
   FileStatus[] files = listStatus(job);
   FileSplit[] splits = new FileSplit[files.length];
-  FileSystem fs = FileSystem.get(job);
   for (int i = 0; i < files.length; i++) {
 FileStatus cur = files[i];
 splits[i] = new FileSplit(cur.getPath(), 0,
@@ -443,6 +442,7 @@
 private String reprUrl;
 private boolean redirecting;
 private int redirectCount;
+private boolean ignoreExternalLinks;
 
 public FetcherThread(Configuration conf) {
   this.setDaemon(true);   // don't hang JVM on exit
@@ -457,6 +457,8 @@
   // backward-compatible default setting
   this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
   this.maxRedirect = conf.getInt("http.redirect.max", 3);
+  this.ignoreExternalLinks = 
+conf.getBoolean("db.ignore.external.links", false);
 }
 
 public void run() {
@@ -673,6 +675,22 @@
 throws MalformedURLException, URLFilterException {
   newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
   newUrl = urlFilters.filter(newUrl);
+  
+  if (ignoreExternalLinks) {
+try {
+  String origHost = new URL(urlString).getHost().toLowerCase();
+  String newHost = new URL(newUrl).getHost().toLowerCase();
+  if (!origHost.equals(newHost)) {
+if (LOG.isDebugEnabled()) {
+  LOG.debug(" - ignoring redirect " + redirType + " from " +
+  urlString + " to " + newUrl +
+  " because external links are ignored");
+}
+return null;
+  }
+} catch (MalformedURLException e) { }
+  }
+  
   if (newUrl != null && !newUrl.equals(urlString)) {
 reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
 url = new Text(newUrl);




svn commit: r746900 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/NutchBean.java

2009-02-22 Thread siren
Author: siren
Date: Mon Feb 23 07:02:30 2009
New Revision: 746900

URL: http://svn.apache.org/viewvc?rev=746900&view=rev
Log:
NUTCH-694 - Distributed Search Server fails

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=746900&r1=746899&r2=746900&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Feb 23 07:02:30 2009
@@ -351,6 +351,8 @@
  
 131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
  (Dmitry Lihachev via siren)
+ 
+132. NUTCH-694 - Distributed Search Server fails (siren)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=746900&r1=746899&r2=746900&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Mon 
Feb 23 07:02:30 2009
@@ -48,13 +48,10 @@
 //LogFormatter.setShowThreadIDs(true);
 //  }
 
-  private String[] segmentNames;
-
   private SearchBean searchBean;
   private SegmentBean segmentBean;
   private final HitInlinks linkDb;
 
-
   /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
* don't want to use too many of those. */
   private static final int MAX_PROHIBITED_TERMS = 20;
@@ -149,8 +146,8 @@
 }
   }
 
-  public String[] getSegmentNames() {
-return segmentNames;
+  public String[] getSegmentNames() throws IOException {
+return segmentBean.getSegmentNames();
   }
 
   public Hits search(Query query, int numHits) throws IOException {
@@ -374,17 +371,23 @@
 
 final Configuration conf = NutchConfiguration.create();
 final NutchBean bean = new NutchBean(conf);
-final Query query = Query.parse(args[0], conf);
-final Hits hits = bean.search(query, 10);
-System.out.println("Total hits: " + hits.getTotal());
-final int length = (int)Math.min(hits.getTotal(), 10);
-final Hit[] show = hits.getHits(0, length);
-final HitDetails[] details = bean.getDetails(show);
-final Summary[] summaries = bean.getSummary(details, query);
+try {
+  final Query query = Query.parse(args[0], conf);
+  final Hits hits = bean.search(query, 10);
+  System.out.println("Total hits: " + hits.getTotal());
+  final int length = (int)Math.min(hits.getTotal(), 10);
+  final Hit[] show = hits.getHits(0, length);
+  final HitDetails[] details = bean.getDetails(show);
+  final Summary[] summaries = bean.getSummary(details, query);
 
-for (int i = 0; i < hits.getLength(); i++) {
-  System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
+  for (int i = 0; i < hits.getLength(); i++) {
+System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
+  }
+} catch (Throwable t) {
+   LOG.error("Exception occured while executing search: " + t, t);
+   System.exit(1);
 }
+System.exit(0);
   }
 
   public long getProtocolVersion(String className, long clientVersion)
@@ -394,7 +397,7 @@
 
   final RPCSearchBean rpcBean = (RPCSearchBean)searchBean;
   return rpcBean.getProtocolVersion(className, clientVersion);
-} else if (SegmentBean.class.getName().equals(className) &&
+} else if (RPCSegmentBean.class.getName().equals(className) &&
segmentBean instanceof RPCSegmentBean) {
 
   final RPCSegmentBean rpcBean = (RPCSegmentBean)segmentBean;




svn commit: r745808 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/m

2009-02-19 Thread siren
Author: siren
Date: Thu Feb 19 10:25:47 2009
New Revision: 745808

URL: http://svn.apache.org/viewvc?rev=745808&view=rev
Log:
NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin, 
contributed by Dmitry Lihachev

Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745808&r1=745807&r2=745808&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Feb 19 10:25:47 2009
@@ -348,6 +348,9 @@
 
 130. NUTCH-563 - Include custom fields in BasicQueryFilter
  (Julien Nioche via siren)
+ 
+131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+ (Dmitry Lihachev via siren)
 
 Release 0.9 - 2007-04-02
 

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Thu Feb 19 10:25:47 2009
@@ -199,20 +199,20 @@
 MimeType mimeType = null;
 String contentType = data.getMeta(Response.CONTENT_TYPE);
 if (contentType == null) {
-// Note by Jerome Charron on 20050415:
-// Content Type not solved by a previous plugin
-// Or unable to solve it... Trying to find it
-// Should be better to use the doc content too
-// (using MimeTypes.getMimeType(byte[], String), but I don't know
-// which field it is?
-// if (MAGIC) {
-//   contentType = MIME.getMimeType(url, content);
-// } else {
-//   contentType = MIME.getMimeType(url);
-// }
-mimeType = MIME.getMimeType(url);
+  // Note by Jerome Charron on 20050415:
+  // Content Type not solved by a previous plugin
+  // Or unable to solve it... Trying to find it
+  // Should be better to use the doc content too
+  // (using MimeTypes.getMimeType(byte[], String), but I don't know
+  // which field it is?
+  // if (MAGIC) {
+  //   contentType = MIME.getMimeType(url, content);
+  // } else {
+  //   contentType = MIME.getMimeType(url);
+  // }
+  mimeType = MIME.getMimeType(url);
 } else {
-mimeType = MIME.forName(contentType);
+  mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
 }
 
 // Checks if we solved the content-type.

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 Thu Feb 19 10:25:47 2009
@@ -16,10 +16,30 @@
  */
 package org.apache.nutch.indexer.more;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
 import junit.framework.TestCase;
 
 public class TestMoreIndexingFilter extends TestCase {
 
+  public void testContentType() throws IndexingException {
+Configuration conf = NutchConfiguration.create();
+assertContentType(conf, "text/html", "text/html");
+assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+  }
+  
   public void testGetParts() {
 String[] parts = MoreIndexingFilter.getParts("text/html");
 assertParts(parts, 2, "text&

svn commit: r745517 - /lucene/nutch/trunk/contrib/web2/

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 14:03:18 2009
New Revision: 745517

URL: http://svn.apache.org/viewvc?rev=745517&view=rev
Log:
remove web2 as agreed on nutch-dev

Removed:
lucene/nutch/trunk/contrib/web2/



svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 12:53:12 2009
New Revision: 745503

URL: http://svn.apache.org/viewvc?rev=745503&view=rev
Log:
NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien 
Nioche

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009
@@ -346,6 +346,9 @@
 129. NUTCH-691 - Update jakarta poi jars to the most relevant version
  (Dmitry Lihachev via siren)
 
+130. NUTCH-563 - Include custom fields in BasicQueryFilter
+ (Julien Nioche via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009
@@ -1119,6 +1119,15 @@
   
 
 
+
+
 
 
 

Modified: 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
 Wed Feb 18 12:53:12 2009
@@ -22,6 +22,13 @@
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermQuery;
 
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
 import org.apache.nutch.analysis.CommonGrams;
 
@@ -31,7 +38,12 @@
 import org.apache.hadoop.conf.Configuration;
 
 /** The default query filter.  Query terms in the default query field are
- * expanded to search the url, anchor and content document fields.*/
+ * expanded to search the url, anchor and content document fields.
+ * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost
+ * to the configuration files (see nutch-default.xml for an example).Such fields will be used in the clauses
+ * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B).
+ * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it.
+ **/
 public class BasicQueryFilter implements QueryFilter {
 
   private static final int  URL_BOOST   = 0;
@@ -44,7 +56,7 @@
 
   private float PHRASE_BOOST;
 
-  private static final String[] FIELDS =
+  private String[] FIELDS =
   { "url", "anchor", "content", "title", "host" };
 
   private float[] FIELD_BOOSTS = new float[5];
@@ -177,9 +189,51 @@
 this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
 this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
 this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
+findAdditionalFields(conf);
   }
 
   public Configuration getConf() {
 return this.conf;
   }
+  
+  /** Searches for parameters of the form : query.basic.(fieldname).boost
+   * and adds the fielname to the list of default fields.
+   **/
+  private void findAdditionalFields(Configuration conf) {
+// get additional fields specified in parameters
+Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost");
+Iterator confEntriesIterator = conf.iterator(); 
+List existingFields = java.util.Arrays.asList(FIELDS);  
+ArrayList tempfieldNames = new ArrayList();
+ArrayList tempfieldBoosts = new ArrayList();
+while (confEntriesIterator.hasNext()){
+  Map.Entry entry = (Map.Entry) confEntriesIterator.next();
+  String key = entry.getKey().toString();
+  Matcher match = pat.matcher(key);
+  if (!match.matches())continue;
+  String fieldName = match.group(1);
+  if (fieldName!=null){
+// check whether it matches one of t

svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/ src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ sr

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 12:43:04 2009
New Revision: 745499

URL: http://svn.apache.org/viewvc?rev=745499&view=rev
Log:
NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed 
by Dmitry Lihachev

Added:

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar   
(with props)

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
   (with props)
Removed:

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java

lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009
@@ -343,6 +343,9 @@
 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
  (Stefan Will, siren)
  
+129. NUTCH-691 - Update jakarta poi jars to the most relevant version
+ (Dmitry Lihachev via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Added: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
--
svn:mime-type = application/octet-stream

Added: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 
12:43:04 2009
@@ -29,10 +29,10 @@
provider-name="jakarta.apache.org">
 

- 
+ 
 
  
- 
+ 
 
  


Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 
2009
@@ -44,7 +44,8 @@
 
   
   
-  
-  
+  
+
+  
 
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
 Wed Feb 18 12:43:04 2009
@@ -53,8 +53,9 @@
 int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
 
 // get a list of character properties
+
 Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, 
chpTableOffset,
-  chpTableSize, fcMi

svn commit: r745448 - /lucene/nutch/trunk/build.xml

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 09:18:07 2009
New Revision: 745448

URL: http://svn.apache.org/viewvc?rev=745448&view=rev
Log:
NUTCH-687 add RAT, also check plugins

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745448&r1=745447&r2=745448&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb 18 09:18:07 2009
@@ -624,7 +624,9 @@
   
 
-  
+  
+   
+   
   
 
   




svn commit: r745446 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/ plugin/field-boost/src/java/org/apache/nutch/indexer/fie

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 09:14:29 2009
New Revision: 745446

URL: http://svn.apache.org/viewvc?rev=745446&view=rev
Log:
NUTCH-688 add missing headers, part 2 rest

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java

lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java

lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java

lucene/nutch/trunk/src/plugin/response-json/src/java/org/apache/nutch/searcher/response/json/JSONResponseWriter.java

lucene/nutch/trunk/src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java

lucene/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java

lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java

lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 Wed Feb 18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.util;
 
 import java.io.DataInput;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Wed Feb 
18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.util;
 
 import java.util.Stack;

Modified: 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
 Wed Feb 18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the A

svn commit: r745416 - /lucene/nutch/trunk/build.xml

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 08:11:46 2009
New Revision: 745416

URL: http://svn.apache.org/viewvc?rev=745416&view=rev
Log:
NUTCH-687 add RAT

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745416&r1=745415&r2=745416&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb 18 08:11:46 2009
@@ -610,4 +610,23 @@
 
   
 
+  
+  
+  
+  
+
+  
+
+  
+
+  
+
+  
+
+  
+  
+
+  
+   
 




svn commit: r745096 - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/index-more/src/test/ src/plugin/index-more/src/test/org/ src/plugi

2009-02-17 Thread siren
Author: siren
Date: Tue Feb 17 14:28:14 2009
New Revision: 745096

URL: http://svn.apache.org/viewvc?rev=745096&view=rev
Log:
fix NUTCH-631 - thanks to Stefan Will

Added:
lucene/nutch/trunk/src/plugin/index-more/src/test/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/

lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/

lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/build.xml

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745096&r1=745095&r2=745096&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 17 14:28:14 2009
@@ -339,6 +339,9 @@
  (Curtis d'Entremont, ab)
 
 127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan)
+
+128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
+     (Stefan Will, siren)
  
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=745096&r1=745095&r2=745096&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Feb 17 14:28:14 2009
@@ -93,6 +93,7 @@
   
 
  
+ 
  
  
  

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745096&r1=745095&r2=745096&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Tue Feb 17 14:28:14 2009
@@ -175,12 +175,31 @@
 return doc;
   }
 
-  // Add Content-Type and its primaryType and subType
+  /**
+   * 
+   * Add Content-Type and its primaryType and subType add contentType,
+   * primaryType and subType to field "type" as un-stored, indexed and
+   * un-tokenized, so that search results can be confined by contentType or its
+   * primaryType or its subType.
+   * 
+   * 
+   * For example, if contentType is application/vnd.ms-powerpoint, search can be
+   * done with one of the following qualifiers
+   * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
+   * all case insensitive. The query filter is implemented in
+   * {@link TypeQueryFilter}.
+   * 
+   * 
+   * @param doc
+   * @param data
+   * @param url
+   * @return
+   */
   private NutchDocument addType(NutchDocument doc, ParseData data, String url) 
{
 MimeType mimeType = null;
 String contentType = data.getMeta(Response.CONTENT_TYPE);
 if (contentType == null) {
-   // Note by Jerome Charron on 20050415:
+// Note by Jerome Charron on 20050415:
 // Content Type not solved by a previous plugin
 // Or unable to solve it... Trying to find it
 // Should be better to use the doc content too
@@ -202,32 +221,31 @@
 }
 
 contentType = mimeType.getName();
-String primaryType = mimeType.getSuperType().getName();
-String subType = mimeType.getSubTypes().first().getName();
-// leave this for future improvement
-//MimeTypeParameterList parameterList = mimeType.getParameters()
-
-// add contentType, primaryType and subType to field "type"
-// as un-stored, indexed and un-tokenized, so that search results
-// can be confined by contentType or its primaryType or its subType.
-// For example, if contentType is application/vnd.ms-powerpoint,
-// search can be done with one of the following qualifiers
-// type:application/vnd.ms-powerpoint
-// type:application
-// type:vnd.ms-powerpoint
-// all case insensitive.
-// The query filter is implemented in TypeQueryFilter.java
+
 doc.add("type", contentType);
-doc.add("type", primaryType);
-doc.add("type", subType);
 
-// add its primaryType and subType to respective fields
-doc.add("primaryType", primaryType);
-doc.add("subType&

svn commit: r743573 - in /lucene/nutch/trunk: site/index.html site/index.pdf src/site/src/documentation/content/xdocs/index.xml

2009-02-11 Thread siren
Author: siren
Date: Wed Feb 11 23:48:50 2009
New Revision: 743573

URL: http://svn.apache.org/viewvc?rev=743573&view=rev
Log:
fix link and name

Modified:
lucene/nutch/trunk/site/index.html
lucene/nutch/trunk/site/index.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/index.xml

Modified: lucene/nutch/trunk/site/index.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?rev=743573&r1=743572&r2=743573&view=diff
==
--- lucene/nutch/trunk/site/index.html (original)
+++ lucene/nutch/trunk/site/index.html Wed Feb 11 23:48:50 2009
@@ -261,7 +261,7 @@

 
Lucene will be extremely well represented at
-   http://us.apachecon.com/c/acus2008/";>ApacheCon 
US 2009
+   http://www.eu.apachecon.com/c/aceu2009/";>ApacheCon EU 2009
in Amsterdam, Netherlands this March 23-27, 2009:

 

Modified: lucene/nutch/trunk/site/index.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.pdf?rev=743573&r1=743572&r2=743573&view=diff
==
--- lucene/nutch/trunk/site/index.pdf (original)
+++ lucene/nutch/trunk/site/index.pdf Wed Feb 11 23:48:50 2009
@@ -157,10 +157,10 @@
 >>
 endobj
 32 0 obj
-<< /Length 3367 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 3366 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gau`V9lo)J'#!u4...@lb,,1N,Y4P.EbrdqF_Wj5.Yl7i3&?YNjV`)*E\.+FDhB->'-JunI+ZkY#bg.&!#po8&3"f7YMY7gnrNuupK&Vb\=s"T58n&8g(GH\1eNA3A+6*,2#K(q...@bsi`gs.r##>j\GPCQh;-Ug@&5D=>kM,0&(;d1Gh;[)cdFM)Z_3W#k3Ci*2Zm^BF3aS0$\_,A50QR'l1]P8WR#kgD/*J^s8UXAo:P3J/EQMo\'sg3;nh)2[oGFG]fXir8\/jKeFWOh['JQM8,"hOQj+Ige%i7?9[P-X"tU6U56K^p(:,\)\DUcsQJ:HMl]4OSr)A[I(.5ijH/f4D5q;_V1Pbm1*"LHH#5q&@D/4E]#;hR(&pLM?WKrMp>Ig5!iABV%7I]&ei$opbQuN\T@,)qc##Q"gEQ3+D+Wt23Mr6>"m0>Hbf+3ERNs7lp68*)Af\fWNWLqju854ms$8\!Z+gb]qMBr4NldBO`.\ajWA4%UH"K,e,Sj)9/G)o[1oSSKlo9K)_i-G9m3n?S"nL>J;s(![(]EAR:VL#S1GOn%#t?51lN8YTiRX,[;YZsBu+?uK\RU_ogQL/20-FOC3nOo-j6u$?BdnS0XWaCBo8pq,R4cZR2u=WmD0;d_linEu8(>k'5o9e+&])6U(X6,9ZTj_fmr.7[;^qJG(KEtSRW;8mu6)^70_=Ocg1QdohLV"v5...@`!q.7jl>gseijh&,!4?`Y5\$-S4k;Xbnj-1u!U\.A2Z7\(5S!$s3WG[!Y;l1th^>=:[bH7eq*#mq.;/+,L)Jk5RJ7+%a:pmhWVN#RL`JQmG48IDIf1g3Q(]3H5LtIAO8=6UrMR=m6HmcE#ZM<`:)jb.=?4fYU65q+(+...@n&@X<.*_ip%a$T7h(6np%W.^ecL(FOlXA/K/oT>5VR.T'n\+geWq1VG0Z>p^fpA9jjWGrV(#...@%0mlmvdzr!x#<@Lo?t%^k,?(Y;'1^-...@ldm-\e3lcth]=kj89=;H*n0;]XNn_Y2^8dZ'UB)ZB^24nK:J+7V1R0`oLhIC`5QT,2eX3$K%CJOm&Zr+IXB%k=8bj*XF4b'3Ys%!C0%mIi`W%o43_8f@@n:;9VF/>c...@=l.g#em!n/$^R.]Wh"s,sb/^O]]X/&WHpF:,cfaWs9;a&":f...@pcro,to-]Uhm/6-=QnT\c)+b#/!)q>*Qk[B.#2s*...@0nbhfn-n\x*g"\n=Oj+n(u-nTBktH:1hh^?!6Vh]-0_?QecCbjL0-\jgSjU53>O0W2Y,y7g...@gopcnj6a]b[phx*$-q>230=&lc]56f-/[i0#F%.)9Mm'ep?[hj...@p(2iFGdfKdL)J!Th%4>QA/XYoKV3EAbErV25NE\Bg9ks$UQk&u\OTP%c/bYE\->S^dHnmqM_H3m&)6_-FZAM0hJ-V-SW
 
Q%eiR-,&FY4\=m]]CI;nFOK7fCpAFY+^;O_PD"]6b9&bGrpi;:&-&U"a...@%q\f(5g)Cqf+VN>b7...@d1v:Y7UQ.,6ZBj0n
+Gau`V9lo)J'#!u4...@lb,,i...@r,*EbrdqF_Wj5.Yl7i3&?YNjV`)*E\.+FDhB->'-JunI+ZkY#bg.&!#po8&3"f7YMY7gYV6FPm]WACiJIaMr/i+[TIjSl8E(o)3sP_16,F^r=-TcZ(Ve,5k/ZDp_;-bg...@x!\7jximsae=duo-^q.7`=i+[rc[o=)S#_$Gh$ffncT1/51__,Sq(QiMVKdiIDhLP/Nl=Zde76(g&^]Y#YYhU;'?...@p)#i2:=[2\,d%k)C\V#)VR,boH:aEB$Wf)tY6J[B8Of*md;=9&-tCVcl!aCuoOuL!u-1'u=f1Ju^k'_i0-`$KDo<0o-Janr2k"6P+r#(VsU#gCIsOXrd^.*Y!]jKO3.`Z,*3&;a7StOdZh1$[J2P+gb]qMBot6h5/YP:dX#>Vb_TOh4p5O^S9`LHmQKEG[A-4Ncbn#]ZLH(5:P]SQU&@.<,W\\DZ0p[\X4YR`L9$(neRa=`/A'W)QRhFhr%E?'#+Je+u(7(/53py6eay68fk8q\^n.p$rh...@s=mi#'g...@$ge7qpxp2,7@/\!?cOP>D^^1]I87aCa19b4iUKtt.BMm=`QVNs\09i70/4jH+erW+m_G(7nN8JYq(>o%b8rY87SNgO^0VT[DdS,JBG[Q;njG(U8!A)c[\+:J@&i:+1B#=<2H9sIlo_]$=J9),B%#[p&q$%tUU6LAHDeN+\9Ufe4cq^ssHL%k6.G1\[D)'ttYn!R&$m1LlY`Sf-C[li+q,LK5,a8W>[lY:Y#G(XnTW&^i?On`M?&;^ER,7#O!M-GlJ*u
 
q<7i[H<7Dl_kD))O;+cmhi#rjl*=edto6?1br\=...@3,,jHFR!6a!iJ-f]#:OOP4IS&j7Q4J.Qg/'WP1$^q:aZpd&f\@,]H!Pbq)\KN;t*)2r...@#x8isuq`viao"?.H:h!ag#es8$yp9m-...@!+a]t?`e/eAH]=Rc"4oln[ys?ed3$h_fp\ciol^^pa?#hap...@ah==7;9MqC0^&*tZ>?.d...@bpdzf8^ffxgo5pi:7e`MUR2TEQI/,iz=jb$p12o-4...@e,gk84m0VL!2CK%>0:2]f;n^go%0$OU5,)ljr...@j5i%engqkk81ryn*$bbg%bV=eRrbpl59WeHb3gDo,\!d%#^[4jNaFnt64'B4%iI*3rH]MgSf,9c=_Fel]ZI&c#epE[6C8=!\i=h"9NC[tb1;hO#-b+r,*>m]WSYotk)JqB2d(j&WS[UcnRF/d...@xs2k3+rc%o!%_inrj=%q<_0.&3KedU"e[::HX39M"a+M-GX>Bh8u9IpR[+10$=nZ>lHh

svn commit: r743464 - in /lucene/nutch/trunk: site/index.html site/index.pdf src/site/src/documentation/content/xdocs/index.xml

2009-02-11 Thread siren
Author: siren
Date: Wed Feb 11 19:43:26 2009
New Revision: 743464

URL: http://svn.apache.org/viewvc?rev=743464&view=rev
Log:
add apachecon promo

Modified:
lucene/nutch/trunk/site/index.html
lucene/nutch/trunk/site/index.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/index.xml

Modified: lucene/nutch/trunk/site/index.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?rev=743464&r1=743463&r2=743464&view=diff
==
--- lucene/nutch/trunk/site/index.html (original)
+++ lucene/nutch/trunk/site/index.html Wed Feb 11 19:43:26 2009
@@ -209,6 +209,10 @@
 News
 
 
+09
 February 2009 - Lucene at ApacheCon Europe 2009 in
+   Amsterdam
+
+
 2 April 2007: Nutch 0.9 
Released
 
 
@@ -247,7 +251,50 @@
 
 News
 
-
+
+09 February 2009 - Lucene at ApacheCon Europe 2009 in
+   Amsterdam
+
+   
+http://www.eu.apachecon.com/c/aceu2009/"; title="ApacheCon EU 2009">
+   http://www.eu.apachecon.com/page_attachments//0115/125x125_basic.gif";>
+   
+
+   Lucene will be extremely well represented at
+   http://us.apachecon.com/c/acus2008/";>ApacheCon 
US 2009
+   in Amsterdam, Netherlands this March 23-27, 2009:
+   
+
+   
+
+   
+http://eu.apachecon.com/c/aceu2009/sessions/197";>Lucene Boot Camp
+   - A two day training session, March 23 & 
24th
+
+
+http://eu.apachecon.com/c/aceu2009/sessions/201";>Solr Boot Camp - 
A one day training session, March 24th
+
+
+http://eu.apachecon.com/c/aceu2009/sessions/136";>Introducing Apache 
Mahout - Grant Ingersoll. March 25th @ 10:30
+
+
+http://eu.apachecon.com/c/aceu2009/sessions/137";>Lucene/Solr Case 
Studies - Erik Hatcher. March 25th @ 11:30
+
+
+http://eu.apachecon.com/c/aceu2009/sessions/138";>Advanced Indexing 
Techniques with Apache Lucene - Michael Busch. March 25th @ 14:00  
+   
+
+http://eu.apachecon.com/c/aceu2009/sessions/251";>Apache Solr - A Case 
Study - Uri Boness. March 26th @ 17:30
+   
+
+http://eu.apachecon.com/c/aceu2009/sessions/250";>Best of breed - 
httpd, forrest, solr and droids - Thorsten Scherler. March 27th @ 17:30
+   
+
+http://eu.apachecon.com/c/aceu2009/sessions/165";>Apache Droids - an 
intelligent standalone robot framework - Thorsten Scherler. March 26th @ 
15:00
+
+   
+
+
 2 April 2007: Nutch 0.9 Released
 The 0.9 release of Nutch is now available. This is the second release of 
Nutch
   based entirely on the underlying Hadoop platform. This release includes 
several critical
@@ -256,41 +303,41 @@
   See http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt";>
   list of changes  made in this version. The release is available
   http://lucene.apache.org/nutch/release/";>here.
-
+
 24 September 2006: Nutch 0.8.1 Released
 The 0.8.1 release of Nutch is now available. This is a maintenance release 
to 0.8 branch fixing many serous bugs found in version 0.8.
   See http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt";>
   list of changes  made in this version. The release is available
   http://lucene.apache.org/nutch/release/";>here.
-
+
 25 July 2006: Nutch 0.8 Released
 The 0.8 release of Nutch is now available. This is the first release of 
Nutch based on
   hadoop architecure. See http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup";>
   CHANGES.txt for list of changes made in this version. The release is 
available
   http://lucene.apache.org/nutch/release/";>here.
-
+
 31 March 2006: Nutch 0.7.2 Released
 The 0.7.2 release of Nutch is now available. This is a bug fix release for 
0.7 branch. See
   http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158";>
   CHANGES.txt for details. The release is available
   http://lucene.apache.org/nutch/release/";>here.
-
+
 1 October 2005: Nutch 0.7.1 Released
 The 0.7.1 release of Nutch is now available. This is a bug fix release. See
   http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986";>
   CHANGES.txt for details. The release is available
   http://lucene.apache.org/nutch/release/";>here.
-
+
 17 August 2005: Nutch 0.7 Released
 This is the first Nutch release as an Apache Lucene sub-project. See 
   http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150";>
   CHANGES.txt for details. The release is available 
  

svn commit: r733014 - /lucene/nutch/trunk/site/doap.rdf

2009-01-09 Thread siren
Author: siren
Date: Fri Jan  9 03:57:01 2009
New Revision: 733014

URL: http://svn.apache.org/viewvc?rev=733014&view=rev
Log:
add missing releases

Modified:
lucene/nutch/trunk/site/doap.rdf

Modified: lucene/nutch/trunk/site/doap.rdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/doap.rdf?rev=733014&r1=733013&r2=733014&view=diff
==
--- lucene/nutch/trunk/site/doap.rdf (original)
+++ lucene/nutch/trunk/site/doap.rdf Fri Jan  9 03:57:01 2009
@@ -33,6 +33,30 @@
 http://wiki.apache.org/nutch/"/>
 
   
+branch-0.9
+nutch-0.9
+2007-04-01
+0.9
+  
+
+
+  
+branch-0.8
+nutch-0.8.1
+2006-09-24
+0.8.1
+  
+
+
+  
+branch-0.8
+nutch-0.8
+2006-06-25
+0.8
+  
+
+
+  
 branch-0.7
 nutch-0.7.2
 2006-03-31




svn commit: r613378 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/searcher/ src/java/org

2008-01-19 Thread siren
Author: siren
Date: Sat Jan 19 00:59:29 2008
New Revision: 613378

URL: http://svn.apache.org/viewvc?rev=613378&view=rev
Log:
NUTCH-580 Remove deprecated hadoop api calls (FS)

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java   (with 
props)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=613378&r1=613377&r2=613378&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jan 19 00:59:29 2008
@@ -191,6 +191,8 @@
 
 66. NUTCH-584 - urls missing from fetchlist (Ruslan Ermilov, ab)
 
+67. NUTCH-580 - Remove deprecated hadoop api calls (FS) (siren)
+
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=613378&r1=613377&r2=613378&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sat Jan 19 
00:59:29 2008
@@ -32,6 +32,7 @@
 import org.apache.nutch.indexer.DeleteDuplicates;
 import org.apache.nutch.indexer.IndexMerger;
 import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
@@ -131,9 +132,9 @@
   linkDbTool.invert(linkDb, segments, true, true, false); // invert links
 
   // index, dedup & merge
-  indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
+  indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, 
HadoopFSUtil.getPassAllFilter()));
   dedup.dedup(new Path[] { indexes });
-  merger.merge(fs.listPaths(indexes), index, tmpDir);
+  merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), 
index, tmpDir);
 } else {
   LOG.warn("No URLs to fetch - check your seed list and URL filters.");
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=613378&r1=613377&r2=613378&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Sat Jan 19 
00:59:29 2008
@@ -31,6 +31,7 @@
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.ToolBase;
 
+import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -181,15 +182,7 @@
   } else if (args[i].equals("-noAdditions")) {
 additionsAllowed = false;
   } else if (args[i].equals("-dir")) {
-Path[] paths = fs.listPaths(new Path(args[++i]), new PathFilter() {
-  public boolean accept(Path dir) {
-try {
-  return fs.isDirectory(dir);
-} catch (IOException ioe) {
-  return false;
-}
-  }
-});
+Path[] paths = fs.listPaths(new Path(args[++i]), 
HadoopFSUtil.getPassDirectoriesFilter(fs));
 dirs.addAll(Arrays.asList(paths));
   } else {
 dirs.add(new Path(args[i]));

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=613378&r1=613377&r2=613378&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sat Ja

svn commit: r546998 - /lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java

2007-06-13 Thread siren
Author: siren
Date: Wed Jun 13 11:50:14 2007
New Revision: 546998

URL: http://svn.apache.org/viewvc?view=rev&rev=546998
Log:
remove debug iteration because it seems to block spelling suggestions 

Modified:

lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java?view=diff&rev=546998&r1=546997&r2=546998
==
--- 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
 Wed Jun 13 11:50:14 2007
@@ -150,14 +150,6 @@
 , originalTerm, ng1, ng2, maxr, bStart, bEnd,
 bTransposition, maxd, lis, true);
 
-Iterator it = lis.iterator();
-
-while (it.hasNext()) {
-  if(LOG.isDebugEnabled()){
-LOG.debug(it.next().toString());
-  }
-}
-
 if (suggestions.length > 0) {
   currentTerm.setSuggestedTerm(suggestions[0]);
 




svn commit: r538273 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

2007-05-15 Thread siren
Author: siren
Date: Tue May 15 11:29:49 2007
New Revision: 538273

URL: http://svn.apache.org/viewvc?view=rev&rev=538273
Log:
NUTCH-161 Change Plain text parser to use parser.character.encoding.default 
property for fall back encoding
spotted by KuroSaka TeruHiko

Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007
@@ -19,6 +19,10 @@
  
  7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin
 (siren)
+
+ 8. NUTCH-161 - Change Plain text parser to
+use parser.character.encoding.default property for fall back encoding
+(KuroSaka TeruHiko, siren)
   
 
 Release 0.9 - 2007-04-02

Modified: 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273
==
--- 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
 Tue May 15 11:29:49 2007
@@ -24,35 +24,42 @@
 import org.apache.hadoop.conf.Configuration;
 
 public class TextParser implements Parser {
+
   private Configuration conf;
+  
+  /**
+   * Encoding to be used when character set isn't specified
+   * as HTTP header.
+   */
+  private String defaultEncoding;
 
+  /**
+   * Parses plain text document. This code uses configured default encoding
+   * {@code parser.character.encoding.default} if character set isn't specified
+   * as HTTP header. FIXME: implement charset detector
+   */
   public ParseResult getParse(Content content) {
 
-// ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
-// Outlink[0], metadata);
-
 String encoding = StringUtil.parseCharacterEncoding(content
 .getContentType());
 String text;
-if (encoding != null) { // found an encoding header
-  try { // try to use named encoding
-text = new String(content.getContent(), encoding);
-  } catch (java.io.UnsupportedEncodingException e) {
-return new ParseStatus(e).getEmptyParseResult(content.getUrl(), 
getConf());
-  }
-} else {
-  // FIXME: implement charset detector. This code causes problem when
-  // character set isn't specified in HTTP header.
-  text = new String(content.getContent()); // use default encoding
+try {
+  text = new String(content.getContent(), encoding != null ? encoding
+  : defaultEncoding);
+} catch (java.io.UnsupportedEncodingException e) {
+  return new ParseStatus(e)
+  .getEmptyParseResult(content.getUrl(), getConf());
 }
+
 ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
 OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
 parseData.setConf(this.conf);
 return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, 
parseData));
-
   }
 
   public void setConf(Configuration conf) {
+defaultEncoding = conf.get("parser.character.encoding.default",
+"windows-1252");
 this.conf = conf;
   }
 




svn commit: r537915 - /lucene/nutch/dist/KEYS

2007-05-14 Thread siren
Author: siren
Date: Mon May 14 09:58:17 2007
New Revision: 537915

URL: http://svn.apache.org/viewvc?view=rev&rev=537915
Log:
update my key

Modified:
lucene/nutch/dist/KEYS

Modified: lucene/nutch/dist/KEYS
URL: 
http://svn.apache.org/viewvc/lucene/nutch/dist/KEYS?view=diff&rev=537915&r1=537914&r2=537915
==
--- lucene/nutch/dist/KEYS (original)
+++ lucene/nutch/dist/KEYS Mon May 14 09:58:17 2007
@@ -1,3 +1,17 @@
+This file contains the PGP keys of various developers.
+Please don't use them for email unless you have to. Their main
+purpose is code signing.
+
+Examples of importing this file in your keystore:
+ gpg --import KEYS.txt
+ (need pgp and other examples here)
+
+Examples of adding your key to this file:
+ pgp -kxa <your name> and append it to this file.
+ (pgpk -ll <your name> && pgpk -xa <your name>) >> this file.
+ (gpg --list-sigs <your name>
+ && gpg --armor --export <your name>) >> this file.
+
 pub   1024D/A7239D59 2005-10-12
   Key fingerprint = 4B96 409A 098D BD51 1DF2  BC18 DBAF 69BE A723 9D59
 uid  Doug Cutting (Lucene guy) <[EMAIL PROTECTED]>
@@ -73,11 +87,27 @@
 pub   1024D/0B7E6CFA 2006-07-06
 uid  Sami Siren <[EMAIL PROTECTED]>
 sig 30B7E6CFA 2006-07-06  Sami Siren <[EMAIL PROTECTED]>
+sig  E222DE4F 2007-05-02  Mathias Herberts <[EMAIL PROTECTED]>
+sig  911203E4 2007-05-02  Mathias Herberts <[EMAIL PROTECTED]>
+sig  302DA568 2007-05-03  Rodent of Unusual Size (DSA) <[EMAIL 
PROTECTED]>
+sig  2C312D2F 2007-05-03  Rodent of Unusual Size (DSS) <[EMAIL 
PROTECTED]>
+sig  F12F6072 2007-05-05  Fred Vos <[EMAIL PROTECTED]>
+sig 3990ED4AA 2007-05-02  Knut Anders Hatlen <[EMAIL PROTECTED]>
+sig 3311A3DE5 2007-05-05  Ruediger Pluem <[EMAIL PROTECTED]>
+sig  A99F75DD 2007-05-03  Rodent of Unusual Size <[EMAIL PROTECTED]>
+sig  5F298824 2007-05-06  Simon Pepping <[EMAIL PROTECTED]>
+sig  4358C584 2007-05-06  Vincent Hennebert <[EMAIL PROTECTED]>
+sig  4CEED75F 2007-05-07  Nick Burch <[EMAIL PROTECTED]>
+sig  C874155C 2007-05-07  Thilo Goetz (home key) <[EMAIL PROTECTED]>
+sig 388817402 2007-05-06  Thomas Vandahl <[EMAIL PROTECTED]>
+sig  01530235 2007-05-02  Luc Maisonobe (general purpose) <[EMAIL 
PROTECTED]>
+sig  40581837 2007-05-08  Nick Kew <[EMAIL PROTECTED]>
+sig  5F6B8B72 2007-05-12  Stefan Bodewig <[EMAIL PROTECTED]>
 sub   2048g/A3A3EC3F 2006-07-06
 sig  0B7E6CFA 2006-07-06  Sami Siren <[EMAIL PROTECTED]>
 
 -----BEGIN PGP PUBLIC KEY BLOCK-----
-Version: GnuPG v1.4.4 (GNU/Linux)
+Version: GnuPG v1.4.7 (GNU/Linux)
 
 mQGiBESs8FMRBADhMg5ONjSVuSVJoYbOL8vvoygjO9qH/MS21Ue2Hx2qLf8xB1/W
 baVL5kEH0ixkeg6H+qO4gGpyJ/cdww0v0CjbxRZw2R2QP1PtpZgioGv4YYNstUis
@@ -90,21 +120,46 @@
 GmZ8Q7LjYOnDyNIh+igVifkrlUlNKh3k8BVEXsH1OxffO28LzLQdU2FtaSBTaXJl
 biA8c2lyZW5AYXBhY2hlLm9yZz6IYAQTEQIAIAUCRKzwUwIbAwYLCQgHAwIEFQII
 AwQWAgMBAh4BAheAAAoJEAKlpgULfmz6vl0An0KCSRbIZjNFyQoDTR7Y/21tw94h
-AJ93zAzfB8woj0MuqiOtUZ29OX/m+7kCDQRErPBsEAgAxjiL5UbpPeA/k2P1QjtL
-Af/JTqG4lN6kBbRbvBbAOAYI0PYuskdsCxImdAopeJFnOm9fU0gGq4aggCeBlZhi
-GiSN865Gm/RwwA5Jbtl0hbE5ZcczhaF7iSsKEwrui1ATciYy432ZH28HpWViZkBP
-zJedwptd9uIrzSWa6OKB+xNLvPrYMmSPHvp6CRPRKyES71IpgXmw8Udy88q5PkMd
-xM0LKPANv68DPx1IWRAiZGTt3/0zJr9lxW3R4waIvF1rB549VPcLl/Z1kXnVrz4B
-7SuMMMtzDJsFD7F03K9jxYcIB+TySmQh7C77uFz3vH4XwviBtrvEi2rh7mGW7gwd
-KwADBggAr04EsfuSET1+BTmVhC7yp+Dy/NE4kzd18I4L4VPd2vhD2y0BrVFK4Q45
-TJV2JQvMZH/rIj0jRVMC7cKTwm2P0igf/7rxw/yvO8DjCYVVwI1zatg5lSUiNxDo
-h2O1g1co9GQATbdMg4YcT3ih6TgPyy10Vpq1D1yzWE7Sd8bllJY5iveK177QQ1IF
-WtrKv4T2TCdTEtt8lkPHvvQ/Ooc55eGg75DOUe/7JSHdW0xht1sqerEoFQd1M7hY
-5ss0MG+qnMOraqhTe54R8Le8zDyxh+AukIeo4PuyPSdMyoJAxcJq0YrItNI54sAc
-50PIIfr07ho0pWWiqxWWmq47IPa5e4hJBBgRAgAJBQJErPBsAhsMAAoJEAKlpgUL
-fmz6o9sAn2llnxCq/ZxaVT1252/g1IjCcJGIAKCgpJFo4pxVT8zCfPzWLnsBu7dL
-Nw==
-=nf3j
+AJ93zAzfB8woj0MuqiOtUZ29OX/m+4hGBBARAgAGBQJGOOPMAAoJEBVFs/7iIt5P
+au8AoJBhBjsv6RD1sYBsfhbaBsZyaENHAJoD/2IhBBMaBV3fNsTCbQilad2YAIhG
+BBARAgAGBQJGOOh1AAoJEGPQra6REgPkOL8An3TWNp6bYNIRwWRKUYsCEHWQ4BMu
+AJ9ix4bnBUf6R33seqwNz7Gp7z8eDohGBBARAgAGBQJGOghLAAoJEFCOrsUwLaVo
+azcAoIaOwIwqXgW+4xZ7GYPfJEFrHGPbAKCG+gIdVX4NmNaxERZPj2qLycs9w4hG
+BBARAgAGBQJGOghLAAoJEN26ZLosMS0vazcAoNgtdYT1uCNRLTdGaYhPvjGfVr04
+AJ9MhDZ+LUm6/+k783wrph1mRU2iY4hGBBARAgAGBQJGPMSSAAoJEJhw7/PxL2By
+uIoAnjog0y6x+vqOqJV+AWDbM99ZrOH+AKCC4u8eDndLGM9XwAp5Tl7jVr5oqIhG
+BBMRAgAGBQJGORnaAAoJEOHh8rCZDtSqY8YAn0n0/gjvZKp7/bwoIj9T7jBkjpbZ
+AJ40MSZ32QBcRnt2vP6vK7/SXpshE4hGBBMRAgAGBQJGPE+UAAoJEEwEKBgxGj3l
+9OQAn145exS7RQZNTU8+BjBzSmRBGL9BAKCJ66ln9ObH2GwEHEhlS0fhEaJAD4ic
+BBABAgAGBQJGOghLAAoJEJrNPMCpn3XdE8kD+wQBy+g+4TS8IVraka2wfibUpuqo
+6UdRXiOO0CUWGBNq1jPE7LthT7tSf76Scfk7p2OiG0DfmkCBhi6hD1TgESOUOuG6
+QJM/VTwNg8KwvKXMgEd0drh/waktIIZoo/PS+LGYsyiLEKk43FL86v

svn commit: r537872 - in /lucene/nutch/dist: HEADER.html KEYS

2007-05-14 Thread siren
Author: siren
Date: Mon May 14 08:15:34 2007
New Revision: 537872

URL: http://svn.apache.org/viewvc?view=rev&rev=537872
Log:
NUTCH-457 Create top level dist directory and checkin KEYS file to subversion

Added:
lucene/nutch/dist/HEADER.html
lucene/nutch/dist/KEYS

Added: lucene/nutch/dist/HEADER.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/dist/HEADER.html?view=auto&rev=537872
==
--- lucene/nutch/dist/HEADER.html (added)
+++ lucene/nutch/dist/HEADER.html Mon May 14 08:15:34 2007
@@ -0,0 +1,9 @@
+http://lucene.apache.org/nutch/";>Nutch Releases
+
+Please make sure you're downloading from http://www.apache.org/dyn/closer.cgi/lucene/nutch/";>a nearby
+mirror site, not from www.apache.org.
+
+For current development versions, see the Nutch http://people.apache.org/builds/lucene/nutch/nightly/";>nightly
+build directory.

Added: lucene/nutch/dist/KEYS
URL: http://svn.apache.org/viewvc/lucene/nutch/dist/KEYS?view=auto&rev=537872
==
--- lucene/nutch/dist/KEYS (added)
+++ lucene/nutch/dist/KEYS Mon May 14 08:15:34 2007
@@ -0,0 +1,187 @@
+pub   1024D/A7239D59 2005-10-12
+  Key fingerprint = 4B96 409A 098D BD51 1DF2  BC18 DBAF 69BE A723 9D59
+uid  Doug Cutting (Lucene guy) <[EMAIL PROTECTED]>
+sig 3A7239D59 2005-10-12  Doug Cutting (Lucene guy) <[EMAIL PROTECTED]>
+sub   2048g/ADDE5978 2005-10-12
+sig  A7239D59 2005-10-12  Doug Cutting (Lucene guy) <[EMAIL PROTECTED]>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1.4.1 (FreeBSD)
+
+mQGiBENNR5oRBAC2ZzxD2fXYht8qkfT/6tjWJxLG4KH2dLEWSYEzku8ZtJ7eA6X7
+/hcvZdhjGH0aA6MAEVSxh6LO1hmRARE2e2Br68j4TjwbQ0J5BOgkMMAArmQe7w6B
+RjKUI3H74Qbfjuk4Ebf1fNkRkpwuw+JxZu5pqpACqwv6nPhcSDDjbuA/1wCgj+++
+uxVSQMF4Xrd0hApOSYGHL8kD/jCU+vM3ILuFVTCgfC5RehmqwQo/f6KEv99jJSxX
+ClcksiLquOH8vMc3MV1YWOe4u93DI7iAYzCylS1s2Wn0bLEBrbdGKLMH4hSSMDRC
+pjnyvzvnEMhMU+Jn3LK6lQw4nHH+aDGFcYZ2pQen7JAcYz7l6QeTsvMnRV+v13K1
+/zRjA/9QUxrgg2N5WQnEhMegIWBKVhxQV6a2mSfeNd0ApxzdqdoHZNkUD+pKMB0F
+oQ9aP55KbtvFosurFgEmvwLIoMnQohxjIhdk0Hx3xMT17CtYl04F0C+QNxeXpWr7
+/B0kq8nALn17hXz5A1bFaiMHX86QmvNyMTDUC2VrVbkV251dlLQuRG91ZyBDdXR0
+aW5nIChMdWNlbmUgZ3V5KSA8Y3V0dGluZ0BhcGFjaGUub3JnPoheBBMRAgAeBQJD
+TUeaAhsDBgsJCAcDAgMVAgMDFgIBAh4BAheAAAoJENuvab6nI51ZjRAAoIZ96gYE
+f8QCDpXkBQqtNgRiF4t5AJ9JKMrN/Ow+Kyl75FU9U2KWyPoMk7kCDQRDTUejEAgA
+m3UdcglfOdgqI7Z9XUX38yqiFzNozSvTdOt3j6evIVvjJ3e0P87tUQlrdsbMcaXd
++PAc7EA5LE0eJlE9jR1/18tsIlYi/n1hxz1lWtaZ+9he3yTB12QmAf4MMTXaRBkI
+ZqwdwZxmL5V+2TmhFT2bIzPLgrMHNsA4dtQuBak41GC+VXovqitS9Xzse2Ki+U9u
+SiRPsD7x5DcgJm9sg/zqCNrvDN8vOC8iHa/CIqsZr3xaPgfQLZp6Xk3doHLc6IJ9
+6knDAZvzJFgfj8MGCQoOExE/1XoNGTWcgoiy0D30ADG+rtIbaRT8tdQ6m19/ytqd
+Zm7ibB7b78/pyfvvcB5tKwADBgf9GwdUdHUPjezlFpcCI/K3XHKdPLi00HJ2L1O8
+5pErBjDyZ5ey7vAMuYB5O31dB7pncSVsTdt9RRQHS+iLrv9aJjvYhV4yQU0ADkgC
+9qEvxm7wpn76AT+Z1LIay/vNoQPxnfWq+uZD/Lnku1VcnMZ5teSG6uJzApBGYsgN
+xpPPsobKKvclZdhO5NhhZLFZ0taWh4pna2jpDTLmyRa4kO7p7rIixsKxFfLUUc33
+2RqBomnm9eRlSvC4BBCq6M7YPLG0Rv5WmzuuWpc865EaMoBEtwPQBb4+qcMN69Lp
+3x6EaymTWmHx1o8aUjAxhORE/miy53eGPzIXY+csjMyAmSxDG4hJBBgRAgAJBQJD
+TUejAhsMAAoJENuvab6nI51ZlTIAn0oHlUPw+v1gVUJ8D2Nu26knOqJKAJ4spe/k
+Sc2xRlsNP3tZiO+jYMAFSg==
+=goQx
+-----END PGP PUBLIC KEY BLOCK-----
+pub  1024D/7C491924 2006-03-30 Piotr Kosiorowski <[EMAIL PROTECTED]>
+sig 3   7C491924 2006-03-30   Piotr Kosiorowski <[EMAIL PROTECTED]>
+sub  2048g/4A70BB35 2006-03-30
+sig 7C491924 2006-03-30   Piotr Kosiorowski <[EMAIL PROTECTED]>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1.2.7 (GNU/Linux)
+
+mQGiBEQrfF8RBACblz5gaIolsKdJgtdy913C+k/QXvaeg3R+8dXXkgVgC5vvRbUk
+Ei6UBRMU5H0cNE76d0XlMYP1MccqdowsfPfWxl04VViW6p+KHmBa2ICIWvq1PQXL
+XhocuRZn6dzfnxcFjsJlsKXtX+okzL9rc1AHiPsb+14XFQtd0/uxs/qeswCgyM68
+hxpwMZU4U0Q7yYkB8usVjbcD/iC65v+8DPhVgxp4o66JJqTYkBZ73mS4f/DDlQsL
+9qCj8h9rLYHmV85hSx3pBBDuz/HjIzu5ruj+l78H++WISXE82hj++OS0bpKnb+nV
+x/iN+b/Y0W0CzMEms+42LcNz1azvLL6ZBgLwnUePT1mBnOy6UgFW1XZGow/XO4Lr
+2py7BACP2WrV+rAzc8RcelmvE3eaAj0DJhAl2Brkdl7B4KDNpBTYZM2TaJ5G5pDK
+EhzH3O6IZP4dRh4iEipl+qcJ0eC5OlKHxqyXXbQYH5jzqkl+4cAQRkCliWuFrGcO
+o3XaOFE54dpY8FZbineEJLrg4Ynh592gO731IcP4gm401ORGv7QrUGlvdHIgS29z
+aW9yb3dza2kgPHBrb3Npb3Jvd3NraUBhcGFjaGUub3JnPoheBBMRAgAeBQJEK3xf
+AhsDBgsJCAcDAgMVAgMDFgIBAh4BAheAAAoJEEsO4ix8SRkkWLwAoLrn6dtn38yI
+8dja2k2lJJ7PVpOoAJ9qZO+QfOfJRf1H+1L6qOuviiDkR7kCDQREK3xpEAgAklbu
+2ctaceFu6nolNd3cnKNqDNppvSRSwDzZZytXjzV10E5VW7fYlN1+huOSV9nRLAIL
+stNloFiOdQGElT0t8Xi9N9X1BuzSkxWMKqDHaTOSnKNupCuDzz9F3oYXVMbLwZBG
+GJAMezd6WuCl+KyhsJgt0GD/H2Ucyck2CqTQRZFPOPOPB2urZbmw8F5bTI3u9J1Q
+ElwApNTrHS04HyNEq5o9j/iTMvvunnkliQFI0Z/flvfHaV6go3/ZhMeVkLU7m/mq
+bPh467HN0MTN5O+znak164nBumxcqD8yUF5TiWD42dykNffbN2ajZzgVvTxWerVV
+mqVMTetbhl3Hoaff0wADBQf/d+XRxh7etS3IO5Jvv85de9QvQPFm5JZpnTNfdnil
+b9G3WRjZIsdmAG2khtJNmlUMUegK0ej6jsCFmsWTqg8cbCG7TBcYySWKSTGklELu
+N69g9VaG60GUX6EOoEmfRMr

svn commit: r537869 - /lucene/nutch/dist/

2007-05-14 Thread siren
Author: siren
Date: Mon May 14 08:05:09 2007
New Revision: 537869

URL: http://svn.apache.org/viewvc?view=rev&rev=537869
Log: (empty)

Added:
lucene/nutch/dist/



svn commit: r537860 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt src/plugin/ontology/lib/commons-logging-1.0.3.jar

2007-05-14 Thread siren
Author: siren
Date: Mon May 14 07:51:59 2007
New Revision: 537860

URL: http://svn.apache.org/viewvc?view=rev&rev=537860
Log:
NUTCH-483 Remove redundant commons-logging jar from ontology plugin

Removed:
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=537860&r1=537859&r2=537860
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon May 14 07:51:59 2007
@@ -16,6 +16,9 @@
 bots in robots.txt (Dogacan Guney via siren)
 
  6. NUTCH-482 - Remove redundant plugin lib-log4j (siren)
+ 
+ 7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin
+(siren)
   
 
 Release 0.9 - 2007-04-02




svn commit: r537857 - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/lib-log4j/ src/plugin/parse-pdf/ src/plugin/parse-rss/

2007-05-14 Thread siren
Author: siren
Date: Mon May 14 07:37:27 2007
New Revision: 537857

URL: http://svn.apache.org/viewvc?view=rev&rev=537857
Log:
NUTCH-482 Remove redundant plugin lib-log4j

Removed:
lucene/nutch/trunk/src/plugin/lib-log4j/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon May 14 07:37:27 2007
@@ -14,7 +14,9 @@
 
  5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other
 bots in robots.txt (Dogacan Guney via siren)
- 
+
+ 6. NUTCH-482 - Remove redundant plugin lib-log4j (siren)
+  
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon May 14 07:37:27 2007
@@ -33,7 +33,6 @@
  
  
  
- 
  
  
  
@@ -122,7 +121,6 @@
 
 
 
-
 
 
 

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Mon May 14 
07:37:27 2007
@@ -21,21 +21,18 @@
 
   
   
-
 
   
 
   
   
 
-  
   
 
   
 
   
   
-
 
 
   

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Mon May 14 
07:37:27 2007
@@ -40,7 +40,6 @@
 

   
-  

 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/build.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/parse-pdf/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/build.xml Mon May 14 07:37:27 2007
@@ -19,21 +19,8 @@
 
   
 
-  
-  
-
-  
-
-  
-  
-
-  
-
-  
-
   
   
-
 
 
   

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Mon May 14 07:37:27 2007
@@ -27,12 +27,10 @@
  
   
   
-  

 

   
-  

 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Mon May 14 07:37:27 2007
@@ -22,21 +22,18 @@
  
  

-   
  
 
  
  

  
- 

  
 
  
  

-   


  

Modified: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857
==
--- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Mon May 14 07:37:27 2007
@@ -33,7 +33,6 @@

   
   
-  

 


svn commit: r537591 - in /lucene/nutch/trunk: site/ src/site/src/documentation/content/xdocs/

2007-05-13 Thread siren
Author: siren
Date: Sun May 13 07:51:20 2007
New Revision: 537591

URL: http://svn.apache.org/viewvc?view=rev&rev=537591
Log:
NUTCH-484 fix link to javadoc contributed by Gal Nitzan

Modified:
lucene/nutch/trunk/site/about.html
lucene/nutch/trunk/site/bot.html
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/i18n.html
lucene/nutch/trunk/site/index.html
lucene/nutch/trunk/site/issue_tracking.html
lucene/nutch/trunk/site/linkmap.html
lucene/nutch/trunk/site/mailing_lists.html
lucene/nutch/trunk/site/nightly.html
lucene/nutch/trunk/site/tutorial.html
lucene/nutch/trunk/site/tutorial8.html
lucene/nutch/trunk/site/version_control.html
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/site.xml

Modified: lucene/nutch/trunk/site/about.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/about.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/about.html (original)
+++ lucene/nutch/trunk/site/about.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/bot.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/bot.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/bot.html (original)
+++ lucene/nutch/trunk/site/bot.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/i18n.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/i18n.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/i18n.html (original)
+++ lucene/nutch/trunk/site/i18n.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/index.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/index.html (original)
+++ lucene/nutch/trunk/site/index.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/issue_tracking.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/issue_tracking.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/issue_tracking.html (original)
+++ lucene/nutch/trunk/site/issue_tracking.html Sun May 13 07:51:20 2007
@@ -99,7 +99,7 @@
 API Docs (0.8.x)
 
 
-http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API 
Docs (nightly)
+http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API
 Docs (nightly)
 
 
 Resources

Modified: lucene/nutch/trunk/site/linkmap.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/linkmap.html?view=diff&rev=537591&r1=537590&r2=537591
==
--- lucene/nutch/trunk/site/linkmap.html (original)
+++ lucene/nutch/trunk/site/linkmap.html Sun M

svn commit: r536925 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java src/plugin/lib-http/src/test/org/apache/nutch/protocol/h

2007-05-10 Thread siren
Author: siren
Date: Thu May 10 09:29:51 2007
New Revision: 536925

URL: http://svn.apache.org/viewvc?view=rev&rev=536925
Log:
NUTCH-446 RobotRulesParser should ignore Crawl-delay values of other bots in 
robots.txt, contributed by Doğacan Güney

Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536925&r1=536924&r2=536925
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:29:51 2007
@@ -11,6 +11,9 @@
 (Eelco Lempsink via ab)
 
  4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren)
+
+ 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other
+bots in robots.txt (Dogacan Guney via siren)
  
 
 Release 0.9 - 2007-04-02

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 Thu May 10 09:29:51 2007
@@ -389,15 +389,17 @@
   } else if ( (line.length() >= 12)
   && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) 
{
 doneAgents = true;
-long crawlDelay = -1;
-String delay = line.substring("Crawl-Delay:".length(), 
line.length()).trim();
-if (delay.length() > 0) {
-  try {
-crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
-  } catch (Exception e) {
-LOG.info("can not parse Crawl-Delay:" + e.toString());
+if (addRules) {
+  long crawlDelay = -1;
+  String delay = line.substring("Crawl-Delay:".length(), 
line.length()).trim();
+  if (delay.length() > 0) {
+try {
+  crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+} catch (Exception e) {
+  LOG.info("can not parse Crawl-Delay:" + e.toString());
+}
+currentRules.setCrawlDelay(crawlDelay);
   }
-  currentRules.setCrawlDelay(crawlDelay);
 }
   }
 }
@@ -500,7 +502,7 @@
 
   /** command-line main for testing */
   public static void main(String[] argv) {
-if (argv.length != 3) {
+if (argv.length < 3) {
   System.out.println("Usage:");
   System.out.println("   java <robots-file> <url-file> <agent-name>+");
   System.out.println("");
@@ -513,7 +515,7 @@
 try { 
   FileInputStream robotsIn= new FileInputStream(argv[0]);
   LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
-  String[] robotNames= new String[argv.length - 1];
+  String[] robotNames= new String[argv.length - 2];
 
   for (int i= 0; i < argv.length - 2; i++) 
 robotNames[i]= argv[i+2];

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 Thu May 10 09:29:51 2007
@@ -262,6 +262,26 @@
   }
 }
   }
+  
+  public void testCrawlDelay() {
+RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+String delayRule1 = "User-agent: nutchbot" + CR +
+"Crawl-delay: 10" + CR +
+"User-agent: foobot" + CR +
+"Crawl-delay: 20" + CR +
+"User-agent: *" + CR + 
+"Disallow:/baz" + CR;
+String delayRule2 = "User-agent: foobot" + CR +
+"Crawl-delay: 20" + CR +
+"User-agent: *" + CR + 
+

svn commit: r536909 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java

2007-05-10 Thread siren
Author: siren
Date: Thu May 10 09:13:15 2007
New Revision: 536909

URL: http://svn.apache.org/viewvc?view=rev&rev=536909
Log:
NUTCH-456 Parse msexcel plugin speedup contributed by Heiko Dietze

Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536909&r1=536908&r2=536909
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:13:15 2007
@@ -10,7 +10,8 @@
  3. NUTCH-393 - Indexer should handle null documents returned by filters.
 (Eelco Lempsink via ab)
 
-
+ 4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren)
+ 
 
 Release 0.9 - 2007-04-02
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?view=diff&rev=536909&r1=536908&r2=536909
==
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 Thu May 10 09:13:15 2007
@@ -40,10 +40,10 @@
   
   protected String extractText(InputStream input) throws Exception {
 
-String resultText = "";
+StringBuilder resultText = new StringBuilder();
 HSSFWorkbook wb = new HSSFWorkbook(input);
 if (wb == null) {
-  return resultText;
+  return resultText.toString();
 }
 
 HSSFSheet sheet;
@@ -69,25 +69,24 @@
 for (int k=0; k

svn commit: r517015 - in /lucene/nutch/trunk: ./ lib/ src/java/org/apache/nutch/parse/ src/plugin/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/index-more/src/test/ src/plu

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 14:18:23 2007
New Revision: 517015

URL: http://svn.apache.org/viewvc?view=rev&rev=517015
Log:
merging 517012:516728 excluding changes made by dennis



Added:
lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
  - copied unchanged from r516728, 
lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
  - copied unchanged from r516728, 
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
  - copied unchanged from r516728, 
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
  - copied unchanged from r516728, 
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
  - copied unchanged from r516728, 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Removed:
lucene/nutch/trunk/src/plugin/index-more/src/test/

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
lucene/nutch/trunk/src/plugin/parse-js/src/test/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/trunk/src/plugin/build.xml

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java

lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=517015&r1=517014&r2=517015
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 14:18:23 2007
@@ -158,18 +158,11 @@
 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
 framework to operate properly (Heiko Dietze via mattmann)
 
-54. Change OutlinkExtractor to use Regular Expressions from JRE (siren)
-
-55. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
+54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
 Groschupf via kubes)
 
-56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL 
-   path is empty (kubes)
-   
-57. Replace oro with jre regular expressions in plugins, remove oro from
-    dependencies (siren)
-
-58. Remove redundant commons logging jars (siren)
+55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL 
+path is empty (kubes)

 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=517015&r1=517014&r2=517015
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Sun Mar 11 14:18:23 2007
@@ -148,20 +148,8 @@
 
   
   
-  
-
-
-
-
-
-
-
-
-
-
-
-
-  
+  
   
 
   

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=517015&r1=517014&r2=517015
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,21 +14,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.nutch.parse;
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache

svn commit: r516908 - /lucene/nutch/trunk/bin/nutch

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 07:30:35 2007
New Revision: 516908

URL: http://svn.apache.org/viewvc?view=rev&rev=516908
Log:
revert to previous version as requested by ab

Modified:
lucene/nutch/trunk/bin/nutch

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=516908&r1=516907&r2=516908
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Sun Mar 11 07:30:35 2007
@@ -148,7 +148,7 @@
   fi
 fi
 
-if $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X"; then
+if [ $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
   JAVA_LIBRARY_PATH=`cygpath -p -w "$JAVA_LIBRARY_PATH"`
 fi
 




svn commit: r516888 - /lucene/nutch/trunk/bin/nutch

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 04:12:23 2007
New Revision: 516888

URL: http://svn.apache.org/viewvc?view=rev&rev=516888
Log:
fix bin/nutch: line 152: cygpath: command not found on linux (FC5), hope i am 
not breaking it for some other env

Modified:
lucene/nutch/trunk/bin/nutch

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=516888&r1=516887&r2=516888
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Sun Mar 11 04:12:23 2007
@@ -148,7 +148,7 @@
   fi
 fi
 
-if [ $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+if $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X"; then
   JAVA_LIBRARY_PATH=`cygpath -p -w "$JAVA_LIBRARY_PATH"`
 fi
 




svn commit: r516885 - /lucene/nutch/trunk/build.xml

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 04:02:27 2007
New Revision: 516885

URL: http://svn.apache.org/viewvc?view=rev&rev=516885
Log:
reduce the size of .job from 19+M down to 14+M

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=516885&r1=516884&r2=516885
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Sun Mar 11 04:02:27 2007
@@ -148,8 +148,20 @@
 
   
   
-  
+  
+
+
+
+
+
+
+
+
+
+
+
+
+  
   
 
   




svn commit: r516870 - in /lucene/nutch/trunk: CHANGES.txt lib/commons-logging-api-1.0.4.jar src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt src/plugin/ontology/lib/commons-logging-1.0.3.jar

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 00:25:25 2007
New Revision: 516870

URL: http://svn.apache.org/viewvc?view=rev&rev=516870
Log:
remove redundant commons-logging jars

Removed:
lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516870&r1=516869&r2=516870
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 00:25:25 2007
@@ -168,6 +168,8 @@

 57. Replace oro with jre regular expressions in plugins, remove oro from
     dependencies (siren)
+
+58. Remove redundant commons logging jars (siren)

 Release 0.8 - 2006-07-25
 




svn commit: r516866 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar

2007-03-11 Thread siren
Author: siren
Date: Sun Mar 11 00:01:22 2007
New Revision: 516866

URL: http://svn.apache.org/viewvc?view=rev&rev=516866
Log:
Remove oro as dependency

Removed:
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516866&r1=516865&r2=516866
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 00:01:22 2007
@@ -166,6 +166,9 @@
 56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL 
path is empty (kubes)

+57. Replace oro with jre regular expressions in plugins, remove oro from
+    dependencies (siren)
+   
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop




svn commit: r516865 - /lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 23:36:56 2007
New Revision: 516865

URL: http://svn.apache.org/viewvc?view=rev&rev=516865
Log:
change urlnormalizer-regex to use regular expressions from jre

Modified:

lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: 
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=516865&r1=516864&r2=516865
==
--- 
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 Sat Mar 10 23:36:56 2007
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.regex;
 
 import java.net.URL;
@@ -28,6 +27,7 @@
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,7 +40,6 @@
 
 import javax.xml.parsers.*;
 import org.w3c.dom.*;
-import org.apache.oro.text.regex.*;
 
 /**
  * Allows users to do regex substitutions on all/any URLs that are encountered,
@@ -65,16 +64,14 @@
* string.
*/
   private static class Rule {
-public Perl5Pattern pattern;
+public Pattern pattern;
 
 public String substitution;
   }
 
-  private HashMap scopedRules;
+  private HashMap<String, List<Rule>> scopedRules;
   
-  private static final List EMPTY_RULES = Collections.EMPTY_LIST;
-
-  private PatternMatcher matcher = new Perl5Matcher();
+  private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST;
 
   /**
* The default constructor which is called from UrlNormalizerFactory
@@ -93,9 +90,9 @@
* configuration files for it.
*/
   public RegexURLNormalizer(Configuration conf, String filename)
-  throws IOException, MalformedPatternException {
+  throws IOException {
 super(conf);
-List rules = readConfigurationFile(filename);
+List<Rule> rules = readConfigurationFile(filename);
 if (rules != null)
   scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
   }
@@ -106,9 +103,9 @@
 // the default constructor was called
 if (this.scopedRules == null) {
   String filename = getConf().get("urlnormalizer.regex.file");
-  scopedRules = new HashMap();
+  scopedRules = new HashMap<String, List<Rule>>();
   URL url = getConf().getResource(filename);
-  List rules = null;
+  List<Rule> rules = null;
   if (url == null) {
 LOG.warn("Can't load the default config file! " + filename);
 rules = EMPTY_RULES;
@@ -126,7 +123,7 @@
 
   // used in JUnit test.
   void setConfiguration(InputStream is, String scope) {
-List rules = readConfiguration(is);
+List<Rule> rules = readConfiguration(is);
 scopedRules.put(scope, rules);
 LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " 
rules.");
   }
@@ -136,7 +133,7 @@
* patterns. It accepts a string url as input and returns the altered string.
*/
   public synchronized String regexNormalize(String urlString, String scope) {
-List curRules = (List)scopedRules.get(scope);
+List<Rule> curRules = scopedRules.get(scope);
 if (curRules == null) {
   // try to populate
   String configFile = getConf().get("urlnormalizer.regex.file." + scope);
@@ -147,7 +144,6 @@
   LOG.warn("Can't load resource for config file: " + configFile);
 } else {
   try {
-InputStream is = resource.openStream();
 curRules = readConfiguration(resource.openStream());
 scopedRules.put(scope, curRules);
   } catch (Exception e) {
@@ -162,14 +158,11 @@
 }
 if (curRules == EMPTY_RULES || curRules == null) {
   // use global rules
-  curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+  curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
 }
-Iterator i = curRules.iterator();
-while (i.hasNext()) {
-  Rule r = (Rule) i.next();
-  urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
-  r.substitution), urlString, Util.SUBSTITUTE_ALL); // a

svn commit: r516862 - /lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 22:50:10 2007
New Revision: 516862

URL: http://svn.apache.org/viewvc?view=rev&rev=516862
Log:
change urlnormalizer-basic to use regular expressions from jre

Modified:

lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java

Modified: 
lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=516862&r1=516861&r2=516862
==
--- 
lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 Sat Mar 10 22:50:10 2007
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,60 +14,62 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.basic;
 
 import java.net.URL;
 import java.net.MalformedURLException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-// Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-// Nutch imports
 import org.apache.nutch.net.URLNormalizer;
-import org.apache.nutch.util.LogUtil;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.*;
 
-/** Converts URLs to a normal form . */
+/**
+ * Converts URLs to a normal form.
+ * 
+ * All substitutions will be done step by step, to ensure that certain
+ * constellations will be normalized, too.
+ * 
+ * 
+ * For example: "/aa/bb/../../cc/../foo.html will be normalized in the 
following
+ * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html"
+ * "/cc/../foo.html" "/foo.html".
+ * 
+ * 
+ * The normalization also takes care of leading "/../", which will be replaced
+ * by "/", because this is a rather a sign of bad webserver configuration than
+ * of a wanted link. For example, urls like "http://www.foo.com/../"; should
+ * return a http 404 error instead of redirecting to "http://www.foo.com";.
+ * 
+ */
 public class BasicURLNormalizer implements URLNormalizer {
 public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
 
-private Perl5Compiler compiler = new Perl5Compiler();
-private ThreadLocal matchers = new ThreadLocal() {
-protected synchronized Object initialValue() {
-  return new Perl5Matcher();
-}
-  };
-private Rule relativePathRule = null;
-private Rule leadingRelativePathRule = null;
+/**
+ * This pattern tries to find spots like "/xx/../" in the url, which could
+ * be replaced by "/" xx consists of chars, different then "/" (slash) and
+ * needs to have at least one char different from ".".
+ */
+private static final Pattern RELATIVE_PATH_PATTERN = 
Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
+
+private static final String RELATIVE_PATH_SUBSTITUTION="/";
+
+/**
+ * This pattern tries to find spots like leading "/../" in the url, which
+ * could be replaced by "/".
+ */
+private static final Pattern LEADING_RELATIVE_PATH_PATTERN = 
Pattern.compile("^(/\\.\\./)+");
+
+private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/";
 
 private Configuration conf;
 
+
 public BasicURLNormalizer() {
-  try {
-// this pattern tries to find spots like "/xx/../" in the url, which
-// could be replaced by "/" xx consists of chars, different then "/"
-// (slash) and needs to have at least one char different from "."
-relativePathRule = new Rule();
-relativePathRule.pattern = (Perl5Pattern)
-  compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
-   Perl5Compiler.READ_ONLY_MASK);
-relativePathRule.substitution = new Perl5Substitution("/");
-
-// this pattern tries to find spots like leading "/../" in the url,
-// which could be replaced by "/"
-leadingRelativePathRule = new Rule();
-leadingRelativePathRule.pattern = (Perl5Pattern)
-  compiler.compile("^(/\\.\\./)+", Perl5Co

svn commit: r516788 - in /lucene/nutch/trunk/src/plugin: ./ parse-js/src/java/org/apache/nutch/ parse-js/src/java/org/apache/nutch/parse/js/ parse-js/src/test/ parse-js/src/test/org/ parse-js/src/test

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 13:39:04 2007
New Revision: 516788

URL: http://svn.apache.org/viewvc?view=rev&rev=516788
Log:
change parse-js to use regular expressions from jre, add junit test, moved 
package.html to proper place

Added:

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
  - copied unchanged from r516662, 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
lucene/nutch/trunk/src/plugin/parse-js/src/test/
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/

lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java
Removed:

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Modified:
lucene/nutch/trunk/src/plugin/build.xml

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516788&r1=516787&r2=516788
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:39:04 2007
@@ -89,6 +89,7 @@
  
  
  
+ 
  
  
  

Modified: 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=516788&r1=516787&r2=516788
==
--- 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 Sat Mar 10 13:39:04 2007
@@ -25,6 +25,8 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,13 +42,6 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -54,11 +49,24 @@
 import org.w3c.dom.NodeList;
 
 /**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
- * by Stephan Strittmatter.
- *
+ * 
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
+ * 
+ * 
+ * 
+ * This Filter extracts javascript from following locations:
+ * 
+ * from inside <script> tags
+ * from html 4.0 events like Window: onload,onunload, Form:
+ * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard:
+ * onkeydown,onkeypress,onkeyup Mouse:
+ * onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+ * 
+ * a href starting with literal "javascript"
+ * 
+ * 
  * @author Andrzej Bialecki <[EMAIL PROTECTED]>
  */
 public class JSParseFilter implements HtmlParseFilter, Parser {
@@ -97,6 +105,7 @@
 Node lNode = n.getAttributes().getNamedItem("language");
 if (lNode == null) lang = "javascript";
 else lang = lNode.getNodeValue();
+//XXX lang is not checked??
 StringBuffer script = new StringBuffer();
 NodeList nn = n.getChildNodes();
 if (nn.getLength() > 0) {
@@ -104,9 +113,9 @@
 if (i > 0) script.append('\n');
 script.append(nn.item(i).getNodeValue());
   }
-  // if (LOG.isInfoEnabled()) {
-  //   LOG.info("script: language=" + lang + ", text: " + 
script.toString());
-  // }
+  if (LOG.isDebugEnabled()) {
+LOG.info("script: language=" + lang + ", text: " + 
scrip

svn commit: r516784 - /lucene/nutch/trunk/src/plugin/build.xml

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 13:02:16 2007
New Revision: 516784

URL: http://svn.apache.org/viewvc?view=rev&rev=516784
Log:
enable junit tests on index-more

Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516784&r1=516783&r2=516784
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:02:16 2007
@@ -83,6 +83,7 @@
   
 
  
+ 
  
  
  




svn commit: r516778 - in /lucene/nutch/trunk/src/plugin/index-more/src: java/org/apache/nutch/indexer/more/ test/ test/org/ test/org/apache/ test/org/apache/nutch/ test/org/apache/nutch/indexer/ test/

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 12:11:43 2007
New Revision: 516778

URL: http://svn.apache.org/viewvc?view=rev&rev=516778
Log:
change MoreIndexingFilter to use regular expressions from jre

Added:
lucene/nutch/trunk/src/plugin/index-more/src/test/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/

lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/

lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
Modified:

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=516778&r1=516777&r2=516778
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Sat Mar 10 12:11:43 2007
@@ -16,14 +16,6 @@
  */
 package org.apache.nutch.indexer.more;
 
-
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.MalformedPatternException;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
@@ -56,6 +48,8 @@
 
 import java.util.Date;
 import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.lang.time.DateUtils;
 
@@ -244,21 +238,15 @@
   // Patterns used to extract filename from possible non-standard
   // HTTP header "Content-Disposition". Typically it looks like:
   // Content-Disposition: inline; filename="foo.ppt"
-  private PatternMatcher matcher = new Perl5Matcher();
 
   private Configuration conf;
-  static Perl5Pattern patterns[] = {null, null};
+  static Pattern patterns[] = new Pattern[2];
   static {
-Perl5Compiler compiler = new Perl5Compiler();
-try {
   // order here is important
   patterns[0] =
-(Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
+Pattern.compile("\\bfilename=['\"](.+)['\"]");
   patterns[1] =
-(Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
-} catch (MalformedPatternException e) {
-  // just ignore
-}
+Pattern.compile("\\bfilename=(\\S+)\\b");
   }
 
   private Document resetTitle(Document doc, ParseData data, String url) {
@@ -266,16 +254,28 @@
 if (contentDisposition == null)
   return doc;
 
-MatchResult result;
-for (int i=0; ihttp://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java?view=auto&rev=516778
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
 Sat Mar 10 12:11:43 2007
@@ -0,0 +1,36 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.nutch.indexer.more;
+
+import junit.framework.TestCase;
+
+public class MoreIndexingFilterTest extends TestCase {
+
+  public void testGetFileNamePlain() {
+assertMatches("attachment; filename=genome.jpeg;", "genome.jpeg");
+assertMatches("attachment; filename=\"genome.jpeg

svn commit: r516758 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 09:41:17 2007
New Revision: 516758

URL: http://svn.apache.org/viewvc?view=rev&rev=516758
Log:
doh! putting oro back since it is still used outside core

Added:
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar   (with props)
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516758&r1=516757&r2=516758
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:41:17 2007
@@ -158,8 +158,7 @@
 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
 framework to operate properly (Heiko Dietze via mattmann)
 
-54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid
-of ORO dependency (siren)
+54. Change OutlinkExtractor to use Regular Expressions from JRE (siren)
 
 
 Release 0.8 - 2006-07-25

Added: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar?view=auto&rev=516758
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
--
svn:mime-type = application/octet-stream




svn commit: r516754 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar src/java/org/apache/nutch/parse/OutlinkExtractor.java

2007-03-10 Thread siren
Author: siren
Date: Sat Mar 10 09:30:04 2007
New Revision: 516754

URL: http://svn.apache.org/viewvc?view=rev&rev=516754
Log:
Change OutlinkExtractor to use Regular Expressions from JRE, get rid of ORO 
dependency

Removed:
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516754&r1=516753&r2=516754
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:30:04 2007
@@ -158,6 +158,9 @@
 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
 framework to operate properly (Heiko Dietze via mattmann)
 
+54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid
+of ORO dependency (siren)
+
 
 Release 0.8 - 2006-07-25
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=516754&r1=516753&r2=516754
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Sat Mar 10 09:30:04 2007
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,28 +14,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
 
 /**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
- * / URLs from plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / 
URLs from
+ * plain text using Regular Expressions.
  * 
  * @see http://wiki.java.net/bin/view/Javapedia/RegularExpressions";>Comparison
@@ -44,12 +37,14 @@
  *  
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
- * @version 1.0
+ *
  * @since 0.7
  */
 public class OutlinkExtractor {
   private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class);
 
+  private static final Outlink[] NO_LINKS = new Outlink[0];
+
   /**
* Regex pattern to get URLs within a plain text.
* 
@@ -57,190 +52,63 @@
*  
href="http://www.truerwords.net/articles/ut/urlactivation.html";>http://www.truerwords.net/articles/ut/urlactivation.html
*  
*/
-  private static final String URL_PATTERN = 
-
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+  private static final String URL_PATTERN = 
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+
+  static final Pattern urlPattern = Pattern.compile(URL_PATTERN);
 
   /**
-   * Extracts Outlink from given plain text.
-   * Applying this method to non-plain-text can result in extremely lengthy
-   * runtimes for parasitic cases (postscript is a known example).
-   * @param plainText  the plain text from wich URLs should be extracted.
+   * Extracts outlinks from a plain text.
+   * 
+   * @param plainText
* 
-   * @return Array of Outlinks within found in plainText
+   * @return Array of Outlink s within found in plainText
*/
-  public static Outlink[] getOutlinks(final String plainText, Configuration 
conf) {
-return OutlinkExtractor.getOutlinks(plainText, "", conf);
+  public static Outlink[] getOutlinks(final String plainText, Configuration 
conf){
+return getOutlinks(plainText, null, conf);
   }
 
+  
   /**
-   * Extracts Outlink from given plain text and adds anchor
-   * to the extracted Outlinks
-   * 
-   * @param plainText the plain text from wich URLs should be ex

svn commit: r499878 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/Indexer.java src/java/org/apache/nutch/segment/SegmentMerger.java src/java/org/apache/nutch/segment/SegmentR

2007-01-25 Thread siren
Author: siren
Date: Thu Jan 25 10:11:59 2007
New Revision: 499878

URL: http://svn.apache.org/viewvc?view=rev&rev=499878
Log:
NUTCH-433

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=499878&r1=499877&r2=499878
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jan 25 10:11:59 2007
@@ -139,6 +139,9 @@
 
 45. NUTCH-68 - Add a tool to generate arbitrary fetchlists. (ab)
 
+46. NUTCH-433 - java.io.EOFException in newer nightlies in mergesegs
+or indexing from hadoop.io.DataOutputBuffer (siren)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=499878&r1=499877&r2=499878
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Jan 
25 10:11:59 2007
@@ -24,7 +24,6 @@
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.io.*;
-import org.apache.nutch.fetcher.Fetcher;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
@@ -51,41 +50,12 @@
 import org.apache.nutch.metadata.Nutch;
 
 /** Create indexes for segments. */
-public class Indexer extends ToolBase implements Reducer {
+public class Indexer extends ToolBase implements Reducer, Mapper {
   
   public static final String DONE_NAME = "index.done";
 
   public static final Log LOG = LogFactory.getLog(Indexer.class);
 
-  /** Wraps inputs in an {@link ObjectWritable}, to permit merging 
different
-   * types in reduce. */
-  public static class InputFormat extends SequenceFileInputFormat {
-public RecordReader getRecordReader(FileSystem fs, FileSplit split,
-JobConf job, Reporter reporter)
-  throws IOException {
-
-  reporter.setStatus(split.toString());
-  
-  return new SequenceFileRecordReader(job, split) {
-  public synchronized boolean next(Writable key, Writable value)
-throws IOException {
-ObjectWritable wrapper = (ObjectWritable)value;
-try {
-  wrapper.set(getValueClass().newInstance());
-} catch (Exception e) {
-  throw new IOException(e.toString());
-}
-return super.next(key, (Writable)wrapper.get());
-  }
-  
-  // override the default - we want ObjectWritable-s here
-  public Writable createValue() {
-return new ObjectWritable();
-  }
-};
-}
-  }
-
   /** Unwrap Lucene Documents created by reduce and add them to an index. */
   public static class OutputFormat
 extends org.apache.hadoop.mapred.OutputFormatBase {
@@ -290,12 +260,9 @@
 
 job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
 job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
+job.setInputFormat(SequenceFileInputFormat.class);
 
-job.setInputFormat(InputFormat.class);
-//job.setInputKeyClass(Text.class);
-//job.setInputValueClass(ObjectWritable.class);
-
-//job.setCombinerClass(Indexer.class);
+job.setMapperClass(Indexer.class);
 job.setReducerClass(Indexer.class);
 
 job.setOutputPath(indexDir);
@@ -332,6 +299,11 @@
   LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
   return -1;
 }
+  }
+
+  public void map(WritableComparable key, Writable value,
+  OutputCollector output, Reporter reporter) throws IOException {
+output.collect(key, new ObjectWritable(value));
   }
 
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=499878&r1=499877&r2=499878
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Thu 
Jan 25 10:11:59 2007
@@ -32,9 +32,7 @@
 import org.apache.hadoop.util.Progressable;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Generator;
-import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.metadata.MetaWrapper;
-import org.apache.nutch.metadat

svn commit: r497867 - /lucene/nutch/trunk/conf/

2007-01-19 Thread siren
Author: siren
Date: Fri Jan 19 08:37:35 2007
New Revision: 497867

URL: http://svn.apache.org/viewvc?view=rev&rev=497867
Log:
NUTCH-400

Modified:
lucene/nutch/trunk/conf/automaton-urlfilter.txt.template
lucene/nutch/trunk/conf/common-terms.utf8
lucene/nutch/trunk/conf/configuration.xsl
lucene/nutch/trunk/conf/crawl-tool.xml
lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
lucene/nutch/trunk/conf/mime-types.xml
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/conf/regex-normalize.xml.template
lucene/nutch/trunk/conf/regex-urlfilter.txt.template
lucene/nutch/trunk/conf/subcollections.xml.template

Modified: lucene/nutch/trunk/conf/automaton-urlfilter.txt.template
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/automaton-urlfilter.txt.template?view=diff&rev=497867&r1=497866&r2=497867
==
--- lucene/nutch/trunk/conf/automaton-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/automaton-urlfilter.txt.template Fri Jan 19 
08:37:35 2007
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # The default url filter.
 # Better for whole-internet crawling.
 

Modified: lucene/nutch/trunk/conf/common-terms.utf8
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/common-terms.utf8?view=diff&rev=497867&r1=497866&r2=497867
==
--- lucene/nutch/trunk/conf/common-terms.utf8 (original)
+++ lucene/nutch/trunk/conf/common-terms.utf8 Fri Jan 19 08:37:35 2007
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Common terms and phrases which will be indexed in n-grams
 # in order to optimize search.
 content:a

Modified: lucene/nutch/trunk/conf/configuration.xsl
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/configuration.xsl?view=diff&rev=497867&r1=497866&r2=497867
==
--- lucene/nutch/trunk/conf/configuration.xsl (original)
+++ lucene/nutch/trunk/conf/configuration.xsl Fri Jan 19 08:37:35 2007
@@ -1,4 +1,20 @@
 
+
 http://www.w3.org/1999/XSL/Transform"; version="1.0">
 
 

Modified: lucene/nutch/trunk/conf/crawl-tool.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-tool.xml?view=diff&rev=497867&r1=497866&r2=497867
==
--- lucene/nutch/trunk/conf/crawl-tool.xml (original)
+++ lucene/nutch/trunk/conf/crawl-tool.xml Fri Jan 19 08:37:35 2007
@@ -1,4 +1,20 @@
  
+
 
 
 

Modified: lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-urlfilter.txt.template?view=diff&rev=497867&r1=497866&r2=497867
==
--- lucene/nutch/trunk/conf/crawl-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/crawl-urlfilter.txt.template Fri Jan 19 08:37:35 
2007
@@ -1,3 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (th

svn commit: r497859 [3/3] - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/analysis-de/ src/plugin/analysis-fr/ src/plugin/clustering-carrot2/ src/plugin/clustering-carrot2/src/java/org/apache/nutc

2007-01-19 Thread siren
Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cached.jsp?view=diff&rev=497859&r1=497858&r2=497859
==
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Fri Jan 19 08:17:32 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%@ page
   session="false"
   contentType="text/html; charset=UTF-8"

Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cluster.jsp?view=diff&rev=497859&r1=497858&r2=497859
==
--- lucene/nutch/trunk/src/web/jsp/cluster.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cluster.jsp Fri Jan 19 08:17:32 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%
 
 // @author Dawid Weiss

Modified: lucene/nutch/trunk/src/web/jsp/explain.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/explain.jsp?view=diff&rev=497859&r1=497858&r2=497859
==
--- lucene/nutch/trunk/src/web/jsp/explain.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/explain.jsp Fri Jan 19 08:17:32 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%@ page 
   session="false"
   contentType="text/html; charset=UTF-8"

Modified: lucene/nutch/trunk/src/web/jsp/index.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/index.jsp?view=diff&rev=497859&r1=497858&r2=497859
==
--- lucene/nutch/trunk/src/web/jsp/index.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/index.jsp Fri Jan 19 08:17:32 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%@ page
   session="f

svn commit: r497540 - in /lucene/nutch/trunk/contrib/web2/plugins: web-query-propose-ontology/src/web/web-query-propose-ontology/ web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/ web

2007-01-18 Thread siren
Author: siren
Date: Thu Jan 18 11:18:52 2007
New Revision: 497540

URL: http://svn.apache.org/viewvc?view=rev&rev=497540
Log:
NUTCH-400

Modified:

lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp

lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp

lucene/nutch/trunk/contrib/web2/plugins/web-subcollection/src/web/web-subcollection/select.jsp

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp?view=diff&rev=497540&r1=497539&r2=497540
==
--- 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp
 Thu Jan 18 11:18:52 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%@ page session="false"%>
 <%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%>
 <%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%>

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp?view=diff&rev=497540&r1=497539&r2=497540
==
--- 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
 Thu Jan 18 11:18:52 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
 <%@ page session="false"%>
 <%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%>
 <%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%>

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp?view=diff&rev=497540&r1=497539&r2=497540
==
--- lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp 
(original)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp 
Thu Jan 18 11:18:52 2007
@@ -1,3 +1,19 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not u

svn commit: r496358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java

2007-01-15 Thread siren
Author: siren
Date: Mon Jan 15 07:02:37 2007
New Revision: 496358

URL: http://svn.apache.org/viewvc?view=rev&rev=496358
Log:
fix NUTCH-430

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=496358&r1=496357&r2=496358
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Jan 15 07:02:37 2007
@@ -133,6 +133,9 @@
 
 43. NUTCH-428 - NullPointerException thrown when agent name is not
 configured properly. Changed to throw RuntimeException instead.
+(siren)
+
+44. NUTCH-430 - Integer overflow in HashComparator.compare (siren)
 
 
 Release 0.8 - 2006-07-25

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=496358&r1=496357&r2=496358
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Jan 
15 07:02:37 2007
@@ -264,39 +264,33 @@
   output.collect(entry.url, entry.datum);
 }
   }
-  
+
   /** Sort fetch lists by hash of URL. */
   public static class HashComparator extends WritableComparator {
-public HashComparator() { super(Text.class); }
+public HashComparator() {
+  super(Text.class);
+}
 
 public int compare(WritableComparable a, WritableComparable b) {
-  Text url1 = (Text)a;
-  Text url2 = (Text)b;
+  Text url1 = (Text) a;
+  Text url2 = (Text) b;
   int hash1 = hash(url1.getBytes(), 0, url1.getLength());
   int hash2 = hash(url2.getBytes(), 0, url2.getLength());
-  if (hash1 != hash2) {
-return hash1 - hash2;
-  }
-  return compareBytes(url1.getBytes(), 0, url1.getLength(),
-  url2.getBytes(), 0, url2.getLength());
+  return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
 }
 
-
 public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
   int hash1 = hash(b1, s1, l1);
   int hash2 = hash(b2, s2, l2);
-  if (hash1 != hash2) {
-return hash1 - hash2;
-  }
-  return compareBytes(b1, s1, l1, b2, s2, l2);
+  return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
 }
 
 private static int hash(byte[] bytes, int start, int length) {
   int hash = 1;
   // make later bytes more significant in hash code, so that sorting by
   // hashcode correlates less with by-host ordering.
-  for (int i = length-1; i >= 0; i--)
-hash = (31 * hash) + (int)bytes[start+i];
+  for (int i = length - 1; i >= 0; i--)
+hash = (31 * hash) + (int) bytes[start + i];
   return hash;
 }
   }




svn commit: r495762 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

2007-01-12 Thread siren
Author: siren
Date: Fri Jan 12 14:12:15 2007
New Revision: 495762

URL: http://svn.apache.org/viewvc?view=rev&rev=495762
Log:
NUTCH-428

Modified:
lucene/nutch/trunk/CHANGES.txt

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495762&r1=495761&r2=495762
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan 12 14:12:15 2007
@@ -131,6 +131,9 @@
 42. NUTCH-420 - Fix a bug in DeleteDuplicates where results depended on the
 order in which IndexDoc-s are processed. (Dogacan Guney via ab)
 
+43. NUTCH-428 - NullPointerException thrown when agent name is not
+configured properly. Changed to throw RuntimeException instead.
+
 
 Release 0.8 - 2006-07-25
 

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=495762&r1=495761&r2=495762
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 Fri Jan 12 14:12:15 2007
@@ -223,6 +223,9 @@
 // Grab the agent names we advertise to robots files.
 //
 String agentName = conf.get("http.agent.name");
+if (null == agentName) {
+  throw new RuntimeException("Agent name not configured!");
+}
 String agentNames = conf.get("http.robots.agents");
 StringTokenizer tok = new StringTokenizer(agentNames, ",");
 ArrayList agents = new ArrayList();




svn commit: r495716 - in /lucene/nutch/trunk: site/nightly.html site/nightly.pdf src/site/src/documentation/content/xdocs/nightly.xml

2007-01-12 Thread siren
Author: siren
Date: Fri Jan 12 12:18:27 2007
New Revision: 495716

URL: http://svn.apache.org/viewvc?view=rev&rev=495716
Log:
fix url to nightly builds

Modified:
lucene/nutch/trunk/site/nightly.html
lucene/nutch/trunk/site/nightly.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml

Modified: lucene/nutch/trunk/site/nightly.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/nightly.html?view=diff&rev=495716&r1=495715&r2=495716
==
--- lucene/nutch/trunk/site/nightly.html (original)
+++ lucene/nutch/trunk/site/nightly.html Fri Jan 12 12:18:27 2007
@@ -162,7 +162,7 @@
 
 
 
-http://people.apache.org/dist/lucene/nutch/nightly/";>Nutch nightly 
builds (0.9-dev)
+http://people.apache.org/builds/lucene/nutch/nightly/";>Nutch nightly 
builds (0.9-dev)
 
   
 

Modified: lucene/nutch/trunk/site/nightly.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/nightly.pdf?view=diff&rev=495716&r1=495715&r2=495716
==
--- lucene/nutch/trunk/site/nightly.pdf (original)
+++ lucene/nutch/trunk/site/nightly.pdf Fri Jan 12 12:18:27 2007
@@ -55,7 +55,7 @@
 /Rect [ 90.0 509.0 188.676 497.0 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://people.apache.org/dist/lucene/nutch/nightly/)
+/A << /URI (http://people.apache.org/builds/lucene/nutch/nightly/)
 /S /URI >>
 /H /I
 >>
@@ -106,9 +106,9 @@
 xref
 0 15
 00 65535 f 
-002182 0 n 
-002240 0 n 
-002290 0 n 
+002184 0 n 
+002242 0 n 
+002292 0 n 
 15 0 n 
 71 0 n 
 000959 0 n 
@@ -116,10 +116,10 @@
 001117 0 n 
 001370 0 n 
 001537 0 n 
-001735 0 n 
-001848 0 n 
-001958 0 n 
-002066 0 n 
+001737 0 n 
+001850 0 n 
+001960 0 n 
+002068 0 n 
 trailer
 <<
 /Size 15
@@ -127,5 +127,5 @@
 /Info 4 0 R
 >>
 startxref
-2402
+2404
 %%EOF

Modified: 
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml?view=diff&rev=495716&r1=495715&r2=495716
==
--- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml 
(original)
+++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml Fri 
Jan 12 12:18:27 2007
@@ -21,7 +21,7 @@
 To report bugs see issue tracking
 
 
-http://people.apache.org/dist/lucene/nutch/nightly/";>Nutch 
nightly builds (0.9-dev)
+http://people.apache.org/builds/lucene/nutch/nightly/";>Nutch 
nightly builds (0.9-dev)
 
   
   




svn commit: r493556 - /lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java

2007-01-06 Thread siren
Author: siren
Date: Sat Jan  6 12:04:03 2007
New Revision: 493556

URL: http://svn.apache.org/viewvc?view=rev&rev=493556
Log:
fix formatting

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?view=diff&rev=493556&r1=493555&r2=493556
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Sat Jan  6 
12:04:03 2007
@@ -33,53 +33,49 @@
   private URLFilter[] filters;
 
   public URLFilters(Configuration conf) {
-  String order = conf.get(URLFILTER_ORDER);
-  this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName());
-  
-  if (this.filters == null) {
-String[] orderedFilters = null;
-if (order != null && !order.trim().equals("")) {
-orderedFilters = order.split("\\s+");
-}
+String order = conf.get(URLFILTER_ORDER);
+this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName());
+
+if (this.filters == null) {
+  String[] orderedFilters = null;
+  if (order != null && !order.trim().equals("")) {
+orderedFilters = order.split("\\s+");
+  }
 
-try {
-ExtensionPoint point = PluginRepository.get(conf)
-.getExtensionPoint(URLFilter.X_POINT_ID);
-if (point == null)
-throw new RuntimeException(URLFilter.X_POINT_ID
-+ " not found.");
-Extension[] extensions = point.getExtensions();
-HashMap filterMap = new HashMap();
-for (int i = 0; i < extensions.length; i++) {
-Extension extension = extensions[i];
-URLFilter filter = (URLFilter) extension
-.getExtensionInstance();
-if (!filterMap.containsKey(filter.getClass().getName())) {
-filterMap.put(filter.getClass().getName(), filter);
-}
-}
-if (orderedFilters == null) {
-conf.setObject(URLFilter.class.getName(), filterMap
-.values().toArray(new URLFilter[0]));
-} else {
-ArrayList filters = new ArrayList();
-for (int i = 0; i < orderedFilters.length; i++) {
-  URLFilter filter = (URLFilter) filterMap
-.get(orderedFilters[i]);
-  if(filter != null){
-filters.add(filter);
-  }
-}
-conf.setObject(URLFilter.class.getName(), 
-filters.toArray(new URLFilter[filters.size()]));
-}
-} catch (PluginRuntimeException e) {
-throw new RuntimeException(e);
+  try {
+ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+URLFilter.X_POINT_ID);
+if (point == null)
+  throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
+Extension[] extensions = point.getExtensions();
+HashMap filterMap = new HashMap();
+for (int i = 0; i < extensions.length; i++) {
+  Extension extension = extensions[i];
+  URLFilter filter = (URLFilter) extension.getExtensionInstance();
+  if (!filterMap.containsKey(filter.getClass().getName())) {
+filterMap.put(filter.getClass().getName(), filter);
+  }
+}
+if (orderedFilters == null) {
+  conf.setObject(URLFilter.class.getName(), filterMap.values().toArray(
+  new URLFilter[0]));
+} else {
+  ArrayList filters = new ArrayList();
+  for (int i = 0; i < orderedFilters.length; i++) {
+URLFilter filter = (URLFilter) filterMap.get(orderedFilters[i]);
+if (filter != null) {
+  filters.add(filter);
 }
-this.filters = (URLFilter[]) conf.getObject(URLFilter.class
-.getName());
+  }
+  conf.setObject(URLFilter.class.getName(), filters
+  .toArray(new URLFilter[filters.size()]));
 }
-  }  
+  } catch (PluginRuntimeException e) {
+throw new RuntimeException(e);
+  }
+  this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName());
+}
+  }
 
   /** Run all defined filters. Assume logical AND. */
   public String filter(String urlString) throws URLFilterException {




svn commit: r493555 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java

2007-01-06 Thread siren
Author: siren
Date: Sat Jan  6 12:00:48 2007
New Revision: 493555

URL: http://svn.apache.org/viewvc?view=rev&rev=493555
Log:
fix formatting

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493555&r1=493554&r2=493555
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
Sat Jan  6 12:00:48 2007
@@ -43,56 +43,69 @@
   private IndexingFilter[] indexingFilters;
 
   public IndexingFilters(Configuration conf) {
-  /* Get indexingfilter.order property */
-  String order = conf.get(INDEXINGFILTER_ORDER);
-  this.indexingFilters =(IndexingFilter[]) 
conf.getObject(IndexingFilter.class.getName()); 
-  if (this.indexingFilters == null) {
-  /* If ordered filters are required, prepare array of filters based 
on property */
-  String[] orderedFilters = null;
-  if (order != null && !order.trim().equals("")) {
-  orderedFilters = order.split("\\s+");
+/* Get indexingfilter.order property */
+String order = conf.get(INDEXINGFILTER_ORDER);
+this.indexingFilters = (IndexingFilter[]) conf
+.getObject(IndexingFilter.class.getName());
+if (this.indexingFilters == null) {
+  /*
+   * If ordered filters are required, prepare array of filters based on
+   * property
+   */
+  String[] orderedFilters = null;
+  if (order != null && !order.trim().equals("")) {
+orderedFilters = order.split("\\s+");
+  }
+  try {
+ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+IndexingFilter.X_POINT_ID);
+if (point == null)
+  throw new RuntimeException(IndexingFilter.X_POINT_ID + " not 
found.");
+Extension[] extensions = point.getExtensions();
+HashMap filterMap = new HashMap();
+for (int i = 0; i < extensions.length; i++) {
+  Extension extension = extensions[i];
+  IndexingFilter filter = (IndexingFilter) extension
+  .getExtensionInstance();
+  if (LOG.isInfoEnabled()) {
+LOG.info("Adding " + filter.getClass().getName());
   }
-try {
-ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID);
-if (point == null)
-throw new RuntimeException(IndexingFilter.X_POINT_ID + " 
not found.");
-Extension[] extensions = point.getExtensions();
-HashMap filterMap = new HashMap();
-for (int i = 0; i < extensions.length; i++) {
-Extension extension = extensions[i];
-IndexingFilter filter = (IndexingFilter) 
extension.getExtensionInstance();
-if (LOG.isInfoEnabled()) {
-  LOG.info("Adding " + filter.getClass().getName());
-}
-if (!filterMap.containsKey(filter.getClass().getName())) {
-filterMap.put(filter.getClass().getName(), filter);
-}
-}
-/* If no ordered filters required, just get the filters in an 
indeterminate order */
-if (orderedFilters == null) {
-conf.setObject(IndexingFilter.class.getName(), 
(IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
-/* Otherwise run the filters in the required order */
-} else {
-ArrayList filters = new 
ArrayList();
-for (int i = 0; i < orderedFilters.length; i++) {
-IndexingFilter filter = (IndexingFilter) filterMap
-.get(orderedFilters[i]);
-if (filter != null) {
-  filters.add(filter);
-}
-}
-conf.setObject(IndexingFilter.class.getName(), 
filters.toArray(new IndexingFilter[filters.size()]));
-}
-} catch (PluginRuntimeException e) {
-throw new RuntimeException(e);
+  if (!filterMap.containsKey(filter.getClass().getName())) {
+filterMap.put(filter.getClass().getName(), filter);
+  }
+}
+/*
+ * If no ordered filters required, just get the filters in an
+ * indeterminate order
+ */
+if (orderedFilters == null) {
+  conf.setObject(IndexingFilt

svn commit: r493548 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexingFilters.java src/test/org/apache/nutch/indexer/TestIndexingFilters.java

2007-01-06 Thread siren
Author: siren
Date: Sat Jan  6 11:49:49 2007
New Revision: 493548

URL: http://svn.apache.org/viewvc?view=rev&rev=493548
Log:
fix NUTCH-421

Added:

lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493548&r1=493547&r2=493548
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jan  6 11:49:49 2007
@@ -119,6 +119,9 @@
 
 38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains
 Filters that are not in plugin.includes (Stefan Groschupf, siren)
+
+39. NUTCH-421 - Allow predeterminate running order of indexing filters
+    (Alan Tanaman, siren)
 
 
 Release 0.8 - 2006-07-25

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=493548&r1=493547&r2=493548
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jan  6 11:49:49 2007
@@ -536,6 +536,24 @@
   
 
 
+
+
+
+  indexingfilter.order
+  
+  The order by which index filters are applied.
+  If empty, all available index filters (as dictated by properties
+  plugin-includes and plugin-excludes above) are loaded and applied in system
+  defined order. If not empty, only named filters are loaded and applied
+  in given order. For example, if this property has value:
+  org.apache.nutch.indexer.basic.BasicIndexingFilter 
org.apache.nutch.indexer.more.MoreIndexingFilter
+  then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+  
+  Filter ordering might have impact on result if one filter depends on output 
of
+  another filter.
+  
+
+
 
 
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493548&r1=493547&r2=493548
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
Sat Jan  6 11:49:49 2007
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.indexer;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 // Commons Logging imports
@@ -35,13 +36,22 @@
 /** Creates and caches {@link IndexingFilter} implementing 
plugins.*/
 public class IndexingFilters {
 
+  public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
+
   public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
 
   private IndexingFilter[] indexingFilters;
 
   public IndexingFilters(Configuration conf) {
+  /* Get indexingfilter.order property */
+  String order = conf.get(INDEXINGFILTER_ORDER);
   this.indexingFilters =(IndexingFilter[]) 
conf.getObject(IndexingFilter.class.getName()); 
   if (this.indexingFilters == null) {
+  /* If ordered filters are required, prepare array of filters based 
on property */
+  String[] orderedFilters = null;
+  if (order != null && !order.trim().equals("")) {
+  orderedFilters = order.split("\\s+");
+  }
 try {
 ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID);
 if (point == null)
@@ -58,7 +68,21 @@
 filterMap.put(filter.getClass().getName(), filter);
 }
 }
-conf.setObject(IndexingFilter.class.getName(), 
(IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+/* If no ordered filters required, just get the filters in an 
indeterminate order */
+if (orderedFilters == null) {
+conf.setObject(IndexingFilter.class.getName(), 
(IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+/* Otherwise run the filters in the required order */
+} else {
+ArrayList filters = new 
ArrayList();
+for (int i = 0; i < orderedFilters.length; i++) {
+IndexingFilter filter = (IndexingFilter) filterMap
+.get(orderedFilters[i]);
+if (filter != null) {
+  filters.add(filter);
+}
+}
+c

svn commit: r493438 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/net/URLFilters.java src/test/org/apache/nutch/net/TestURLFilters.java

2007-01-06 Thread siren
Author: siren
Date: Sat Jan  6 01:39:20 2007
New Revision: 493438

URL: http://svn.apache.org/viewvc?view=rev&rev=493438
Log:
Fix NUTCH-325

Added:
lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493438&r1=493437&r2=493438
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jan  6 01:39:20 2007
@@ -117,6 +117,9 @@
 37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after
 skipping bad URLs. (Michael Stack via ab)
 
+38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains
+Filters that are not in plugin.includes (Stefan Groschupf, siren)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?view=diff&rev=493438&r1=493437&r2=493438
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Sat Jan  6 
01:39:20 2007
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.net;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.nutch.plugin.Extension;
@@ -28,10 +29,11 @@
 /** Creates and caches {@link URLFilter} implementing plugins.*/
 public class URLFilters {
 
+  public static final String URLFILTER_ORDER = "urlfilter.order";
   private URLFilter[] filters;
 
   public URLFilters(Configuration conf) {
-  String order = conf.get("urlfilter.order");
+  String order = conf.get(URLFILTER_ORDER);
   this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName());
   
   if (this.filters == null) {
@@ -60,12 +62,16 @@
 conf.setObject(URLFilter.class.getName(), filterMap
 .values().toArray(new URLFilter[0]));
 } else {
-URLFilter[] filter = new URLFilter[orderedFilters.length];
+ArrayList filters = new ArrayList();
 for (int i = 0; i < orderedFilters.length; i++) {
-filter[i] = (URLFilter) filterMap
+  URLFilter filter = (URLFilter) filterMap
 .get(orderedFilters[i]);
+  if(filter != null){
+filters.add(filter);
+  }
 }
-conf.setObject(URLFilter.class.getName(), filter);
+conf.setObject(URLFilter.class.getName(), 
+filters.toArray(new URLFilter[filters.size()]));
 }
 } catch (PluginRuntimeException e) {
 throw new RuntimeException(e);

Added: lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java?view=auto&rev=493438
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java Sat 
Jan  6 01:39:20 2007
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestURLFilters extends TestCase {
+
+  /**
+   * Testcase for NUTCH-325.
+   * @throws URLFilterException
+   */
+  public void testNonExistingUrlFilter() throws URLFilterException {
+Configuration conf = NutchConfiguration.create();
+String class1 = "NonExistingFilter";
+ 

svn commit: r493159 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

2007-01-05 Thread siren
Author: siren
Date: Fri Jan  5 11:43:48 2007
New Revision: 493159

URL: http://svn.apache.org/viewvc?view=rev&rev=493159
Log:
reuse existing code in DecreasingFloatComparator

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=493159&r1=493158&r2=493159
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan  
5 11:43:48 2007
@@ -248,20 +248,15 @@
 
   }
 
-  public static class DecreasingFloatComparator extends WritableComparator {
-
-public DecreasingFloatComparator() {
-  super(FloatWritable.class);
-}
+  public static class DecreasingFloatComparator extends 
FloatWritable.Comparator {
 
 /** Compares two FloatWritables decreasing. */
-public int compare(WritableComparable o1, WritableComparable o2) {
-  float thisValue = ((FloatWritable) o1).get();
-  float thatValue = ((FloatWritable) o2).get();
-  return (thisValue

svn commit: r493114 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

2007-01-05 Thread siren
Author: siren
Date: Fri Jan  5 10:17:01 2007
New Revision: 493114

URL: http://svn.apache.org/viewvc?view=rev&rev=493114
Log:
minor mod to javadoc

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=493114&r1=493113&r2=493114
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan  
5 10:17:01 2007
@@ -353,7 +353,10 @@
 .currentTimeMillis(), true, false);
   }
 
-  /** Generate fetchlists in a segment. */
+  /**
+   * Generate fetchlists in a segment.
+   * @return Path to generated segment or null if no entries were selected.
+   * */
   public Path generate(Path dbDir, Path segments,
int numLists, long topN, long curTime, boolean filter,
boolean force)




svn commit: r485076 - in /lucene/nutch/trunk/src: java/org/apache/nutch/metadata/SpellCheckedMetadata.java test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java

2006-12-09 Thread siren
Author: siren
Date: Sat Dec  9 14:27:07 2006
New Revision: 485076

URL: http://svn.apache.org/viewvc?view=rev&rev=485076
Log:
Optimize SpellCheckedMetadata further by taking into account the fact that it 
is used only for http-headers.

I am starting to believe that spellchecking should just be an utility method 
used by http protocol plugins.

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java

lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?view=diff&rev=485076&r1=485075&r2=485076
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
Sat Dec  9 14:27:07 2006
@@ -25,10 +25,9 @@
 
 /**
  * A decorator to Metadata that adds spellchecking capabilities to property
- * names.
- *
- * All the static String fields declared by this class are used as reference
- * names for syntax correction on meta-data naming.
+ * names. Currently used spelling vocabulary contains just the httpheaders from
+ * {@link HttpHeaders} class.
+ * 
  */
 public class SpellCheckedMetadata extends Metadata {
 
@@ -49,18 +48,23 @@
*/
   private static String[] normalized = null;
 
-  // Uses self introspection to fill the metanames index and the
-  // metanames list.
   static {
-for (Field field : SpellCheckedMetadata.class.getFields()) {
-  int mods = field.getModifiers();
-  if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
-  && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
-try {
-  String val = (String) field.get(null);
-  NAMES_IDX.put(normalize(val), val);
-} catch (Exception e) {
-  // Simply ignore...
+
+// Uses following array to fill the metanames index and the
+// metanames list.
+Class[] spellthese = {HttpHeaders.class};
+
+for (Class spellCheckedNames : spellthese) {
+  for (Field field : spellCheckedNames.getFields()) {
+int mods = field.getModifiers();
+if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+&& Modifier.isStatic(mods) && 
field.getType().equals(String.class)) {
+  try {
+String val = (String) field.get(null);
+NAMES_IDX.put(normalize(val), val);
+  } catch (Exception e) {
+// Simply ignore...
+  }
 }
   }
 }
@@ -125,8 +129,7 @@
 
   @Override
   public void add(final String name, final String value) {
-String normalized = getNormalizedName(name);
-super.add(normalized, value);
+super.add(getNormalizedName(name), value);
   }
 
   @Override

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?view=diff&rev=485076&r1=485075&r2=485076
==
--- 
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 (original)
+++ 
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 Sat Dec  9 14:27:07 2006
@@ -36,6 +36,8 @@
  */
 public class TestSpellCheckedMetadata extends TestCase {
 
+  private static final int NUM_ITERATIONS = 1;
+
   public TestSpellCheckedMetadata(String testName) {
 super(testName);
   }
@@ -63,7 +65,7 @@
 assertEquals("Content-Type", SpellCheckedMetadata
 .getNormalizedName("contntype"));
   }
-
+  
   /** Test for the add(String, String) method. */
   public void testAdd() {
 String[] values = null;
@@ -237,18 +239,35 @@
 assertEquals(0, result.size());
 meta.add("name-one", "value-1.1");
 result = writeRead(meta);
+meta.add("Contenttype", "text/html");
 assertEquals(1, result.size());
 assertEquals(1, result.getValues("name-one").length);
 assertEquals("value-1.1", result.get("name-one"));
 meta.add("name-two", "value-2.1");
 meta.add("name-two", "value-2.2");
 result = writeRead(meta);
-assertEquals(2, result.size());
+assertEquals(3, result.size());
 assertEquals(1, result.getValues("name-one").length);
 assertEquals("value-1.1", result.getValues("name-one")[0]);
 assertEquals(2, result.getValues("name-two").length);
 assertEquals("value-2.1"

svn commit: r485072 - in /lucene/nutch/trunk/src/test/org/apache/nutch: crawl/TestLinkDbMerger.java indexer/TestDeleteDuplicates.java

2006-12-09 Thread siren
Author: siren
Date: Sat Dec  9 14:20:02 2006
New Revision: 485072

URL: http://svn.apache.org/viewvc?view=rev&rev=485072
Log:
generate test resources under build/test

Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java

lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?view=diff&rev=485072&r1=485071&r2=485072
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java 
Sat Dec  9 14:20:02 2006
@@ -87,7 +87,7 @@
 expected.put(url21, urls21_expected);
 conf = NutchConfiguration.create();
 fs = FileSystem.get(conf);
-testDir = new Path("test-crawldb-" +
+testDir = new Path("build/test/test-linkdb-" +
 new java.util.Random().nextInt());
 fs.mkdirs(testDir);
   }
@@ -105,8 +105,6 @@
   public void testMerge() throws Exception {
 Configuration conf = NutchConfiguration.create();
 FileSystem fs = FileSystem.get(conf);
-Path testDir = new Path("test-linkdb-" +
-new java.util.Random().nextInt());
 fs.mkdirs(testDir);
 Path linkdb1 = new Path(testDir, "linkdb1");
 Path linkdb2 = new Path(testDir, "linkdb2");

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=485072&r1=485071&r2=485072
==
--- 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java 
(original)
+++ 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java 
Sat Dec  9 14:20:02 2006
@@ -31,7 +31,6 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.util.NutchConfiguration;
 
 import junit.framework.TestCase;
@@ -47,7 +46,7 @@
 conf = NutchConfiguration.create();
 conf.set("fs.default.name", "local");
 fs = FileSystem.get(conf);
-root = new Path("dedup2-test-" + new Random().nextInt());
+root = new Path("build/test/dedup2-test-" + new Random().nextInt());
 // create test indexes
 index1 = createIndex("index1", true, 1.0f, 10L);
 index2 = createIndex("index2", false, 2.0f, 20L);




svn commit: r481738 - in /lucene/nutch/trunk/contrib/web2: README.txt build.xml

2006-12-03 Thread siren
Author: siren
Date: Sun Dec  3 00:49:48 2006
New Revision: 481738

URL: http://svn.apache.org/viewvc?view=rev&rev=481738
Log:
simplify compilation process

Modified:
lucene/nutch/trunk/contrib/web2/README.txt
lucene/nutch/trunk/contrib/web2/build.xml

Modified: lucene/nutch/trunk/contrib/web2/README.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/README.txt?view=diff&rev=481738&r1=481737&r2=481738
==
--- lucene/nutch/trunk/contrib/web2/README.txt (original)
+++ lucene/nutch/trunk/contrib/web2/README.txt Sun Dec  3 00:49:48 2006
@@ -23,34 +23,13 @@
 controllers and pojos, jar libraries), ui markup
 (in form of html, jsp), ui resources css, javascript.
 
-Before compiling core nutch plugins you must edit the
-core nutch plugin 'nutch-extensionpoints' plugin.xml
-and add following snippet into it:
-
-
-
-
-
-
-
-
-
-To compile you need to fist build your nutch (core and plugins)
-after that run ant war to generate war.
-
 To compile web2 plugins you must issue command
 ant compile-plugins
 
 After compiling you must enable plugins, please refer to nutch
-documentation 
+documentation
+
+To build deployable .war issue command ant war.
 
 The nutch plugins are not included in the generated war and you
 need to properly configure where your plugins are. This is achieved
@@ -125,23 +104,21 @@
 
 absolute path, must start with /WEB-INF
 
-   
-
-   
-   
+
+  
+  
+
 
 Referencing jsp resources inside plugins
 
 absolute path, must start with /plugin/
 
-
-   
-
-   
-   
-
+
+  
+  
+
 
 Static resources
 

Modified: lucene/nutch/trunk/contrib/web2/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/build.xml?view=diff&rev=481738&r1=481737&r2=481738
==
--- lucene/nutch/trunk/contrib/web2/build.xml (original)
+++ lucene/nutch/trunk/contrib/web2/build.xml Sun Dec  3 00:49:48 2006
@@ -79,7 +79,7 @@
   
-  
+  
 
 
   
 
+  
+  
+
+
+  
+  
+
+
+  
+
   
@@ -127,7 +140,8 @@
 
   
 
-  
+  
 
   




svn commit: r481445 - in /lucene/nutch/trunk: site/ site/images/ src/site/src/documentation/ src/site/src/documentation/content/xdocs/ src/site/src/documentation/resources/images/

2006-12-01 Thread siren
Author: siren
Date: Fri Dec  1 15:25:49 2006
New Revision: 481445

URL: http://svn.apache.org/viewvc?view=rev&rev=481445
Log:
added lucene logo so site is independent, added link to solr, added link to my 
apache page in credits

Added:
lucene/nutch/trunk/site/images/lucene_green_150.gif   (with props)

lucene/nutch/trunk/src/site/src/documentation/resources/images/lucene_green_150.gif
   (with props)
Modified:
lucene/nutch/trunk/site/about.html
lucene/nutch/trunk/site/bot.html
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/credits.pdf
lucene/nutch/trunk/site/i18n.html
lucene/nutch/trunk/site/index.html
lucene/nutch/trunk/site/issue_tracking.html
lucene/nutch/trunk/site/linkmap.html
lucene/nutch/trunk/site/linkmap.pdf
lucene/nutch/trunk/site/mailing_lists.html
lucene/nutch/trunk/site/nightly.html
lucene/nutch/trunk/site/tutorial.html
lucene/nutch/trunk/site/tutorial8.html
lucene/nutch/trunk/site/version_control.html
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/site.xml
lucene/nutch/trunk/src/site/src/documentation/skinconf.xml

Modified: lucene/nutch/trunk/site/about.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/about.html?view=diff&rev=481445&r1=481444&r2=481445
==
--- lucene/nutch/trunk/site/about.html (original)
+++ lucene/nutch/trunk/site/about.html Fri Dec  1 15:25:49 2006
@@ -21,7 +21,7 @@
 
 
 
-http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; 
title="Apache Lucene">
+http://lucene.apache.org/";>
 
 
 http://lucene.apache.org/nutch/";>
@@ -127,6 +127,9 @@
 
 
 http://lucene.apache.org/hadoop/";>Hadoop
+
+
+http://incubator.apache.org/solr/";>Solr
 
 
 

Modified: lucene/nutch/trunk/site/bot.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/bot.html?view=diff&rev=481445&r1=481444&r2=481445
==
--- lucene/nutch/trunk/site/bot.html (original)
+++ lucene/nutch/trunk/site/bot.html Fri Dec  1 15:25:49 2006
@@ -21,7 +21,7 @@
 
 
 
-http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; 
title="Apache Lucene">
+http://lucene.apache.org/";>
 
 
 http://lucene.apache.org/nutch/";>
@@ -127,6 +127,9 @@
 
 
 http://lucene.apache.org/hadoop/";>Hadoop
+
+
+http://incubator.apache.org/solr/";>Solr
 
 
 

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?view=diff&rev=481445&r1=481444&r2=481445
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Fri Dec  1 15:25:49 2006
@@ -21,7 +21,7 @@
 
 
 
-http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; 
title="Apache Lucene">
+http://lucene.apache.org/";>
 
 
 http://lucene.apache.org/nutch/";>
@@ -128,6 +128,9 @@
 
 http://lucene.apache.org/hadoop/";>Hadoop
 
+
+http://incubator.apache.org/solr/";>Solr
+
 
 
 
@@ -178,7 +181,9 @@
 http://www-scf.usc.edu/~mattmann/";>Chris A. Mattmann
 
   
-Sami Siren
+
+http://people.apache.org/~siren";>Sami Siren
+
   
 John Xing
 
@@ -186,7 +191,7 @@
 
 
 
-
+
 Friends
 
 
@@ -217,7 +222,7 @@
 
 
 
-
+
 Sponsors
 
 

Modified: lucene/nutch/trunk/site/credits.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?view=diff&rev=481445&r1=481444&r2=481445
==
--- lucene/nutch/trunk/site/credits.pdf (original)
+++ lucene/nutch/trunk/site/credits.pdf Fri Dec  1 15:25:49 2006
@@ -58,10 +58,10 @@
 >>
 endobj
 14 0 obj
-<< /Length 2280 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2321 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gat=-b?![^&Dd([EMAIL 
PROTECTED](g>8rh)98bLREO<8q+kCLYQbWS$l/LH`\btQ/n-u`VVIQ&afn\c,1ap*Z)a&i/`+[!+V0JT.oNcCqRNCM#Z%Q7thgF3mBRPUaU?!II$k!h0`E(I'[EMAIL
 
PROTECTED]&#JoCC%72pi_(rVhVI2C`JNP"*haZD!Kkq`97s3h<<8`I.lInD8LM\Y<,#I2-"X][4IAapZS@>EZ&mT8Nm8R4TNIZLNET"RF\SWHFbVa,^l2jZjPgmO:>\.ABf1`A4>ACR*XGDan"e0]76F[&WL?,LIXdHp1WR8+Q1M[.m46hgr<`nCaWJFa=We`'pqfds69Y;Sk_f>&\13ol[o=g:4aiE]qFqj^WB`92G%3f&kP"Q*u'ii4[[,h(DtEpcP473CP7g/5LB!FOs_<;5i$peluanbc&!5Ucp$oJ/:$XWp'jS`97BNiWMF#*k=TJ>gFf'+QbGL:@e
 DFBI62ZBS9&U-_*&[EMAIL 
PROTECTED],02,gK0kj-iV,*#$A8UroM-[Zj7fp&V9[aI

svn commit: r481437 [2/2] - in /lucene/nutch/trunk/contrib/web2: ./ plugins/ plugins/web-caching-oscache/ plugins/web-clustering/ plugins/web-clustering/src/web/web-clustering/ plugins/web-keymatch/ p

2006-12-01 Thread siren
Modified: 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties?view=diff&rev=481437&r1=481436&r2=481437
==
--- 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties
 Fri Dec  1 15:01:00 2006
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 search.title = resultats de cerca
 search.search = Cerca
 search.hits = Coincidències {0}-{1} (d'un total de {2} documents 
coincidents):

Modified: 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties?view=diff&rev=481437&r1=481436&r2=481437
==
--- 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties
 Fri Dec  1 15:01:00 2006
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 lang=de
 anchors.title = Verweise 
 anchors.anchors = Herzeigende Link-Texte:

Modified: 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties?view=diff&rev=481437&r1=481436&r2=481437
==
--- 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties
 Fri Dec  1 15:01:00 2006
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #This is the default resource file for nutch ui localization.
 #If you create a new localized version of resources, please use
 #this as the base
@@ -96,4 +111,4 @@
 preferences.numResults.info=
 
 #text on save button
-preferences.submit=Save and return to search
\ No newline at end of file
+preferences.submit=Save and return to search

Modified: 
lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_es.properties
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_es.properties?view=diff&rev=481437&r1=481436&r2=481437
===

svn commit: r477786 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml

2006-11-21 Thread siren
Author: siren
Date: Tue Nov 21 09:51:57 2006
New Revision: 477786

URL: http://svn.apache.org/viewvc?view=rev&rev=477786
Log:
NUTCH-362

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/parse-plugins.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=477786&r1=477785&r2=477786
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Nov 21 09:51:57 2006
@@ -83,6 +83,9 @@
 27. NUTCH-405 - Content object is not properly initialized in map method
 of ParseSegment (siren)
 
+28. NUTCH-362 - Remove parse-text from unsupported filetypes in
+parse-plugins.xml (siren)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?view=diff&rev=477786&r1=477785&r2=477786
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Nov 21 09:51:57 2006
@@ -22,16 +22,6 @@
 
 
 
-   
-   
-   
-   
-
-   
-   
-   
-



@@ -46,7 +36,6 @@
 


-   

 

@@ -113,20 +102,8 @@


 
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-

-   
+   

 

@@ -145,7 +122,6 @@
 


-   

 

@@ -158,66 +134,14 @@


 
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-



 
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-



 
-   
-   
-   
-
-   
-   
-   
-
-   
-   
-   
-



@@ -238,30 +162,15 @@
 


-   

 


-   
-   
-
-   
-   
-   
-
-   
-   

 

-   


-   
-
-   
-   

 





svn commit: r477757 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/Content.java

2006-11-21 Thread siren
Author: siren
Date: Tue Nov 21 09:19:51 2006
New Revision: 477757

URL: http://svn.apache.org/viewvc?view=rev&rev=477757
Log:
NUTCH-405

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=477757&r1=477756&r2=477757
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Nov 21 09:19:51 2006
@@ -80,6 +80,9 @@
 
 26. NUTCH-403 - Make URL filtering optional in Generator (siren)
 
+27. NUTCH-405 - Content object is not properly initialized in map method
+of ParseSegment (siren)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=477757&r1=477756&r2=477757
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue 
Nov 21 09:19:51 2006
@@ -66,7 +66,8 @@
   newKey.set(key.toString());
   key = newKey;
 }
-Content content = (Content)value;
+Content content = (Content) value;
+content.forceInflate();
 
 Parse parse = null;
 ParseStatus status;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=477757&r1=477756&r2=477757
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Tue Nov 
21 09:19:51 2006
@@ -298,4 +298,12 @@
 return typeName;
   }
 
+  /**
+   * By calling this method caller forces the next access to any property (via
+   * getters and setters) to check if decompressing of data is really required.
+   */
+  public void forceInflate() {
+inflated = false;
+  }
+
 }




svn commit: r476879 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/test/ src/test/org/apache/nutch/crawl/ src/test/org/apache/nutch/fetcher/

2006-11-19 Thread siren
Author: siren
Date: Sun Nov 19 10:48:39 2006
New Revision: 476879

URL: http://svn.apache.org/viewvc?view=rev&rev=476879
Log:
NUTCH-403 Make URL filtering optional in Generator

Added:
lucene/nutch/trunk/src/test/filter-all.txt
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476879&r1=476878&r2=476879
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 10:48:39 2006
@@ -78,6 +78,8 @@
 
 25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren)
 
+26. NUTCH-403 - Make URL filtering optional in Generator (siren)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=476879&r1=476878&r2=476879
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sun Nov 19 
10:48:39 2006
@@ -115,9 +115,8 @@
 injector.inject(crawlDb, rootUrlDir);
   
 for (int i = 0; i < depth; i++) { // generate new segment
-  Path segment =
-generator.generate(crawlDb, segments, -1,
- topN, System.currentTimeMillis());
+  Path segment = generator.generate(crawlDb, segments, -1, topN, System
+  .currentTimeMillis(), false);
   fetcher.fetch(segment, threads);  // fetch it
   if (!Fetcher.isParsing(job)) {
 parseSegment.parse(segment);// parse it, if needed

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=476879&r1=476878&r2=476879
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Nov 
19 10:48:39 2006
@@ -44,6 +44,7 @@
 /** Generates a subset of a crawl db to fetch. */
 public class Generator extends ToolBase {
 
+  public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
   public static final String GENERATE_MAX_PER_HOST_BY_IP = 
"generate.max.per.host.by.ip";
   public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
   public static final String CRAWL_TOP_N = "crawl.topN";
@@ -89,6 +90,7 @@
 private FloatWritable sortValue = new FloatWritable();
 private boolean byIP;
 private long dnsFailure = 0L;
+private boolean filter;
 
 public void configure(JobConf job) {
   curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
@@ -99,6 +101,7 @@
   normalizers = new URLNormalizers(job, 
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
   scfilters = new ScoringFilters(job);
   hostPartitioner.configure(job);
+  filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
 }
 
 public void close() {}
@@ -108,13 +111,16 @@
 OutputCollector output, Reporter reporter)
   throws IOException {
   Text url = (Text)key;
-  // don't generate URLs that don't pass URLFilters
-  try {
-if (filters.filter(url.toString()) == null)
-  return;
-  } catch (URLFilterException e) {
-if (LOG.isWarnEnabled()) {
-  LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + 
")");
+  if (filter) {
+// If filtering is on don't generate URLs that don't pass URLFilters
+try {
+  if (filters.filter(url.toString()) == null)
+return;
+} catch (URLFilterException e) {
+  if (LOG.isWarnEnabled()) {
+LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
++ ")");
+  }
 }
   }
   CrawlDatum crawlDatum = (CrawlDatum)value;
@@ -291,13 +297,13 @@
   /** Generate fetchlists in a segment. */
   public Path generate(Path dbDir, Path segments)
 throws IOException {
-return generate(dbDir, segments,
--1, Long.MAX_VALUE, System.currentTimeMillis());
+return generate(dbDir, se

svn commit: r476814 - /lucene/nutch/trunk/CHANGES.txt

2006-11-19 Thread siren
Author: siren
Date: Sun Nov 19 05:13:54 2006
New Revision: 476814

URL: http://svn.apache.org/viewvc?view=rev&rev=476814
Log:
NUTCH-404 Fix LinkDB Usage - implementation mismatch

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476814&r1=476813&r2=476814
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 05:13:54 2006
@@ -76,6 +76,8 @@
 24. NUTCH-388 - nutch-default.xml has outdated example for urlfilter.order
 (reported by Jared Dunne)
 
+25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop




svn commit: r476810 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java

2006-11-19 Thread siren
Author: siren
Date: Sun Nov 19 04:54:29 2006
New Revision: 476810

URL: http://svn.apache.org/viewvc?view=rev&rev=476810
Log:
NUTCH-404 Fix LinkDB Usage - implementation mismatch

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?view=diff&rev=476810&r1=476809&r2=476810
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sun Nov 19 
04:54:29 2006
@@ -329,12 +329,12 @@
   
   public int run(String[] args) throws Exception {
 if (args.length < 2) {
-  System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> 
<seg2> ...) [-noNormalizing] [-noFiltering]");
+  System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> 
<seg2> ...) [-noNormalize] [-noFilter]");
   System.err.println("\tlinkdb\toutput LinkDb to create or update");
   System.err.println("\t-dir segmentsDir\tparent directory of several 
segments, OR");
   System.err.println("\tseg1 seg2 ...\t list of segment directories");
-  System.err.println("\t-noNormalizing\tdon't normalize link URLs");
-  System.err.println("\t-noFiltering\tdon't apply URLFilters to link 
URLs");
+  System.err.println("\t-noNormalize\tdon't normalize link URLs");
+  System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
   return -1;
 }
 Path segDir = null;
@@ -370,7 +370,5 @@
   return -1;
 }
   }
-
-
 
 }




svn commit: r476617 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml

2006-11-18 Thread siren
Author: siren
Date: Sat Nov 18 13:55:44 2006
New Revision: 476617

URL: http://svn.apache.org/viewvc?view=rev&rev=476617
Log:
NUTCH-388 Fix description of urlfilter.order

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476617&r1=476616&r2=476617
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Nov 18 13:55:44 2006
@@ -73,6 +73,9 @@
 
 23. NUTCH-395 - Increase fetching speed (siren)
 
+24. NUTCH-388 - nutch-default.xml has outdated example for urlfilter.order
+(reported by Jared Dunne)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=476617&r1=476616&r2=476617
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Nov 18 13:55:44 2006
@@ -780,7 +780,7 @@
   plugin-includes and plugin-excludes above) are loaded and applied in system
   defined order. If not empty, only named filters are loaded and applied
   in given order. For example, if this property has value:
-  org.apache.nutch.net.RegexURLFilter org.apache.nutch.net.PrefixURLFilter
+  org.apache.nutch.urlfilter.regex.RegexURLFilter 
org.apache.nutch.urlfilter.prefix.PrefixURLFilter
   then RegexURLFilter is applied first, and PrefixURLFilter second.
   Since all filters are AND'ed, filter ordering does not have impact
   on end result, but it may have performance implication, depending




svn commit: r475382 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

2006-11-15 Thread siren
Author: siren
Date: Wed Nov 15 11:49:55 2006
New Revision: 475382

URL: http://svn.apache.org/viewvc?view=rev&rev=475382
Log:
oh well

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475382&r1=475381&r2=475382
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 
15 11:49:55 2006
@@ -91,10 +91,10 @@
 private long dnsFailure = 0L;
 
 public void configure(JobConf job) {
-  curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
-  limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
-  maxPerHost = job.getInt("generate.max.per.host", -1);
-  byIP = job.getBoolean("generate.max.per.host.by.ip", false);
+  curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
+  limit = job.getLong(CRAWL_TOP_N,Long.MAX_VALUE)/job.getNumReduceTasks();
+  maxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1);
+  byIP = job.getBoolean(GENERATE_MAX_PER_HOST_BY_IP, false);
   filters = new URLFilters(job);
   normalizers = new URLNormalizers(job, 
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
   scfilters = new ScoringFilters(job);




svn commit: r475380 - /lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java

2006-11-15 Thread siren
Author: siren
Date: Wed Nov 15 11:43:48 2006
New Revision: 475380

URL: http://svn.apache.org/viewvc?view=rev&rev=475380
Log:
added more junit tests

Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475380&r1=475379&r2=475380
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed 
Nov 15 11:43:48 2006
@@ -182,7 +182,8 @@
   }
   
   /**
-   * Test that generator obeys the property "generate.max.per.host".
+   * Test that generator obeys the property "generate.max.per.host" and
+   * "generate.max.per.host.by.ip".
* @throws Exception 
*/
   public void testGenerateHostIPLimit() throws Exception{




svn commit: r475378 - in /lucene/nutch/trunk/src: java/org/apache/nutch/crawl/Generator.java test/org/apache/nutch/crawl/TestGenerator.java

2006-11-15 Thread siren
Author: siren
Date: Wed Nov 15 11:42:22 2006
New Revision: 475378

URL: http://svn.apache.org/viewvc?view=rev&rev=475378
Log:
added more junit tests

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475378&r1=475377&r2=475378
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 
15 11:42:22 2006
@@ -44,6 +44,10 @@
 /** Generates a subset of a crawl db to fetch. */
 public class Generator extends ToolBase {
 
+  public static final String GENERATE_MAX_PER_HOST_BY_IP = 
"generate.max.per.host.by.ip";
+  public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
+  public static final String CRAWL_TOP_N = "crawl.topN";
+  public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
   public static final Log LOG = LogFactory.getLog(Generator.class);
   
   public static class SelectorEntry implements Writable {

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475378&r1=475377&r2=475378
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed 
Nov 15 11:42:22 2006
@@ -31,12 +31,10 @@
 import junit.framework.TestCase;
 
 /**
- * Basic generator test:
- * 1. Insert entries in crawldb
- * 2. Generates entries to fetch
- * 3. Verifies that number of generated urls match
- * 4. Verifies that highest scoring urls are generated 
- 
+ * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
+ * fetch 3. Verifies that number of generated urls match 4. Verifies that
+ * highest scoring urls are generated
+ *
  * @author nutch-dev 
  *
  */
@@ -50,11 +48,11 @@
 
   FileSystem fs;
 
-  final static Path testdir=new Path("build/test/generator-test");
+  final static Path testdir = new Path("build/test/generator-test");
 
   protected void setUp() throws Exception {
 conf = CrawlDBTestUtil.createConfiguration();
-fs=FileSystem.get(conf);
+fs = FileSystem.get(conf);
 fs.delete(testdir);
   }
 
@@ -70,81 +68,243 @@
   }
 
   /**
-   * Test that generator generates fetchlish ordered by score (desc)
-   * 
+   * Test that generator generates fetchlish ordered by score (desc).
+   *
* @throws Exception
*/
   public void testGenerateHighest() throws Exception {
 
-int NUM_RESULTS=2;
- 
+final int NUM_RESULTS = 2;
+
 ArrayList list = new ArrayList();
-
-for(int i=0;i<=100;i++){
-  list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + 
pad(i)),
-new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+
+for (int i = 0; i <= 100; i++) {
+  list.add(createURLCrawlDatum("http://aaa/" + pad(i),
+  1, i));
 }
-
-dbDir = new Path(testdir, "crawldb");
-segmentsDir = new Path(testdir, "segments");
-fs.mkdirs(dbDir);
-fs.mkdirs(segmentsDir);
-
-// create crawldb
-CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
-
-// generate segment
-Generator g=new Generator(conf);
-Path generatedSegment=g.generate(dbDir, segmentsDir, -1, NUM_RESULTS, 
Long.MAX_VALUE);
-
-Path fetchlist=new Path(new Path(generatedSegment, 
CrawlDatum.GENERATE_DIR_NAME), "part-0");
-
-// verify results
-SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
-
-ArrayList l=new ArrayList();
-
-READ:
-  do {
-  Text key=new Text();
-  CrawlDatum value=new CrawlDatum();
-  if(!reader.next(key, value)) break READ;
-  l.add(new URLCrawlDatum(key, value));
-} while(true);
 
-reader.close();
+createCrawlDB(list);
 
+Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+
+Path fetchlist = new Path(new Path(generatedSegment,
+CrawlDatum.GENERATE_DIR_NAME), "part-0");
+
+ArrayList l = readContents(fetchlist);
+
 // sort urls by score desc
 Collections.sort(l, new ScoreComparator());
 
-//verify we got right amount of records
+// verify we got right amount of records
 assertEquals(NUM_RESULTS, l.size());
 
-//verify we have the highest scoring urls
+// verify we have the highest scoring urls

svn commit: r474464 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/protocol/ src/test/org/apache/nutch/metadata/ src/test/org/apache/nutch/protocol/

2006-11-13 Thread siren
Author: siren
Date: Mon Nov 13 11:46:56 2006
New Revision: 474464

URL: http://svn.apache.org/viewvc?view=rev&rev=474464
Log:
NUTCH-395 Increase fetching speed

Added:

lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java

lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=474464&r1=474463&r2=474464
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Nov 13 11:46:56 2006
@@ -71,6 +71,7 @@
 
 22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren)
 
+23. NUTCH-395 - Increase fetching speed (siren)
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=474464&r1=474463&r2=474464
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Nov 
13 11:46:56 2006
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,103 +16,58 @@
  */
 package org.apache.nutch.metadata;
 
-// JDK imports
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
 import java.util.Properties;
-
-// Commons Lang imports
-import org.apache.commons.lang.StringUtils;
-
-// Hadoop imports
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
 
 /**
- * A syntax tolerant and multi-valued metadata container.
- *
- * All the static String fields declared by this class are used as reference
- * names for syntax correction on meta-data naming.
+ * A multi-valued metadata container.
  *
  * @author Chris Mattmann
  * @author Jérôme Charron
+ *
  */
-public class Metadata implements CreativeCommons,
- DublinCore,
- HttpHeaders,
- Nutch,
- Office,
- Writable {
-  
-
-  /** Used to format DC dates for the DATE metadata field */
-  public final static SimpleDateFormat DATE_FORMAT = 
-  new SimpleDateFormat("yyyy-MM-dd");
-  
-
-  private final static Map NAMES_IDX = new HashMap();
-  private static String[] normalized = null;
-
-  // Uses self introspection to fill the metanames index and the
-  // metanames list.
-  static {
-Field[] fields = Metadata.class.getFields();
-for (int i=0; i metadata = null;
+
 
-  
-  /** Constructs a new, empty metadata. */
+  /**
+   * Constructs a new, empty metadata.
+   */
   public Metadata() {
-metadata = new HashMap();
+metadata = new HashMap<String, String[]>();
   }
 
   /**
+   * Returns true if named value is multivalued.
+   * @param name name of metadata
+   * @return true is named value is multivalued, false if single
+   * value or null
*/
-  public boolean isMultiValued(String name) {
-return getValues(name).length > 1;
+  public boolean isMultiValued(final String name) {
+return metadata.get(name) != null && metadata.get(name).length > 1;
   }
 
   /**
* Returns an array of the names contained in the metadata.
+   * @return Metadata names
*/
   public String[] names() {
-Iterator iter = metadata.keySet().iterator();
-List names = new ArrayList();
-while(iter.hasNext()) {
-  names.add(getNormalizedName((String) iter.next()));
-}
-return (String[]) names.toArray(new String[names.size()]);
+return metadata.keySet().toArray(new String[metadata.keySet().size()]);
   }
-  
+
   /**
* Get the value associated to a metadata name.
* If many values are assiociated to the specified name, then the first
@@ -121,12 +76,12 @@
* @param name of the metadata.
* @return the value associated to the specified metadata name.
*/
- 

svn commit: r473955 - in /lucene/nutch/trunk: contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/ contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/

2006-11-12 Thread siren
Author: siren
Date: Sun Nov 12 04:34:17 2006
New Revision: 473955

URL: http://svn.apache.org/viewvc?view=rev&rev=473955
Log:
NUTCH-400 add missing header to .java

Modified:

lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java

lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java

lucene/nutch/trunk/contrib/web2/src/main/java/org/apache/nutch/webapp/common/ServletContextServiceLocator.java

lucene/nutch/trunk/contrib/web2/src/main/java/org/apache/nutch/webapp/common/Startable.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java

lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java

lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

lucene/nutch/trunk/src/test/org/apache/nutch/searcher/DistributedSearchTest.java

lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java?view=diff&rev=473955&r1=473954&r2=473955
==
--- 
lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java
 Sun Nov 12 04:34:17 2006
@@ -1,3 +1,19 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
 package org.apache.nutch.cache;
 
 import 
com.opensymphony.oscache.plugins.diskpersistence.AbstractDiskPersistenceListener;

Modified: 
lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java?view=diff&rev=473955&r1=473954&r2=473955
==
--- 
lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java
 (original)
+++ 
lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java
 Sun Nov 12 04:34:17 2006
@@ -1,3 +1,19 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
 package org.apache.nutch.webapp.controller;
 import java.io.IOExcept

svn commit: r473727 - /lucene/nutch/trunk/CHANGES.txt

2006-11-11 Thread siren
Author: siren
Date: Sat Nov 11 07:27:40 2006
New Revision: 473727

URL: http://svn.apache.org/viewvc?view=rev&rev=473727
Log:
NUTCH-399 Change CommandRunner to use concurrent api from jdk

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=473727&r1=473726&r2=473727
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Nov 11 07:27:40 2006
@@ -69,6 +69,8 @@
 21. NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one
 partition. (ab)
 
+22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren)
+
 
 Release 0.8 - 2006-07-25
 




  1   2   3   >