svn commit: r934694 - in /lucene/nutch/trunk/src: java/org/apache/nutch/indexer/solr/ java/org/apache/nutch/searcher/ java/org/apache/nutch/tools/ plugin/parse-tika/src/java/org/apache/nutch/parse/tik
Author: siren Date: Fri Apr 16 05:42:28 2010 New Revision: 934694 URL: http://svn.apache.org/viewvc?rev=934694&view=rev Log: add missing license headers Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java lucene/nutch/trunk/src/test/org/apache/nutch/searcher/QueryParamsTest.java lucene/nutch/trunk/src/test/org/apache/nutch/util/TestEncodingDetector.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=934694&r1=934693&r2=934694&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Fri Apr 16 05:42:28 2010 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.nutch.indexer.solr; import java.io.DataInput; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java?rev=934694&r1=934693&r2=934694&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java Fri Apr 16 05:42:28 2010 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.searcher; import java.io.DataInput; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=934694&r1=934693&r2=934694&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Fri Apr 16 05:42:28 2010 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.tools; import java.io.IO
svn commit: r910173 - /lucene/nutch/trunk/src/web/jsp/search.jsp
Author: siren Date: Mon Feb 15 08:09:53 2010 New Revision: 910173 URL: http://svn.apache.org/viewvc?rev=910173&view=rev Log: NUTCH-793 search.jsp compile errors Modified: lucene/nutch/trunk/src/web/jsp/search.jsp Modified: lucene/nutch/trunk/src/web/jsp/search.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/search.jsp?rev=910173&r1=910172&r2=910173&view=diff == --- lucene/nutch/trunk/src/web/jsp/search.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/search.jsp Mon Feb 15 08:09:53 2010 @@ -204,7 +204,7 @@ // position this is good, bad?... ugly? Hits hits; try{ - query.getParams.initFrom(start + hitsToRetrieve, hitsPerSite, "site", sort, reverse); + query.getParams().initFrom(start + hitsToRetrieve, hitsPerSite, "site", sort, reverse); hits = bean.search(query); } catch (IOException e){ hits = new Hits(0,new Hit[0]);
svn commit: r910044 - in /lucene/nutch/trunk: conf/nutch-default.xml default.properties
Author: siren Date: Sun Feb 14 17:13:29 2010 New Revision: 910044 URL: http://svn.apache.org/viewvc?rev=910044&view=rev Log: NUTCH-792 update version Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/default.properties Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=910044&r1=910043&r2=910044&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sun Feb 14 17:13:29 2010 @@ -113,7 +113,7 @@ http.agent.version - Nutch-1.0 + Nutch-1.1-dev A version string to advertise in the User-Agent header. Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=910044&r1=910043&r2=910044&view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Sun Feb 14 17:13:29 2010 @@ -1,6 +1,6 @@ Name=Nutch name=nutch -version=1.0 +version=1.1-dev final.name=${name}-${version} year=2006
svn commit: r910041 - in /lucene/nutch/trunk: CHANGES.txt default.properties
Author: siren Date: Sun Feb 14 17:02:55 2010 New Revision: 910041 URL: http://svn.apache.org/viewvc?rev=910041&view=rev Log: NUTCH-790 Some external javadoc links are broken Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/default.properties Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=910041&r1=910040&r2=910041&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Feb 14 17:02:55 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-790 Some external javadoc links are broken (siren) + * NUTCH-766 Tika parser (jnioche via mattmann) * NUTCH-786 Improvement to the list of suffix domains (jnioche) Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=910041&r1=910040&r2=910041&view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Sun Feb 14 17:02:55 2010 @@ -35,9 +35,9 @@ # Proxy Host and Port to use for building JavaDoc javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= -javadoc.link.java=http://java.sun.com/j2se/1.4.2/docs/api/ -javadoc.link.lucene=http://jakarta.apache.org/lucene/docs/api/ -javadoc.link.hadoop=http://lucene.apache.org/hadoop/docs/api/ +javadoc.link.java=http://java.sun.com/javase/6/docs/api/ +javadoc.link.lucene=http://lucene.apache.org/java/2_9_1/api/all +javadoc.link.hadoop=http://hadoop.apache.org/common/docs/r0.20.1/api/ javadoc.packages=org.apache.nutch.* dist.dir=${build.dir}/${final.name}
svn commit: r905410 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/searcher/ src/test/org/apache/nutch/searcher/ src/web/jsp/
Author: siren Date: Mon Feb 1 20:47:34 2010 New Revision: 905410 URL: http://svn.apache.org/viewvc?rev=905410&view=rev Log: NUTCH-775 Enhance Searcher interface Added: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryParams.java lucene/nutch/trunk/src/test/org/apache/nutch/searcher/QueryParamsTest.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java lucene/nutch/trunk/src/web/jsp/search.jsp Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905410&r1=905409&r2=905410&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Feb 1 20:47:34 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-775 Enhance searcher interface (siren) + * NUTCH-781 Update Tika to v0.6 (jnioche) * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java?rev=905410&r1=905409&r2=905410&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java Mon Feb 1 20:47:34 2010 @@ -49,10 +49,6 @@ private int id; private Query query; -private int numHits; -private String dedupField; -private String sortField; -private boolean reverse; public 
SearchTask(int id) { this.id = id; @@ -62,16 +58,20 @@ if (!liveServers[id]) { return null; } - return beans[id].search(query, numHits, dedupField, sortField, reverse); + return beans[id].search(query); } +/** + * @deprecated since 1.1, use {@link #setSearchArgs(Query)} instead + */ public void setSearchArgs(Query query, int numHits, String dedupField, String sortField, boolean reverse) { this.query = query; - this.numHits = numHits; - this.dedupField = dedupField; - this.sortField = sortField; - this.reverse = reverse; + query.setParams(new QueryParams(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse)); +} + +private void setSearchArgs(Query query) { + this.query = query; } } @@ -199,12 +199,10 @@ return beans[hit.getIndexNo()].getExplanation(query, hit); } - public Hits search(Query query, int numHits, String dedupField, - String sortField, boolean reverse) throws IOException { - + @Override + public Hits search(Query query) throws IOException { for (Callable task : searchTasks) { - ((SearchTask)task).setSearchArgs(query, numHits, dedupField, sortField, - reverse); + ((SearchTask)task).setSearchArgs(query); } List> allHits; @@ -216,10 +214,12 @@ } PriorityQueue queue;// cull top hits from results -if (sortField == null || reverse) { - queue = new PriorityQueue(numHits); +if (query.getParams().getSortField() == null +|| query.getParams().isReverse()) { + queue = new PriorityQueue(query.getParams().getNumHits()); } else { - queue = new PriorityQueue(numHits, new Comparator() { + queue = new PriorityQueue(query.getParams().getNumHits(), + new Comparator() { public int compare(Hit h1, Hit h2) { return h2.compareTo(h1); // reverse natural order } @@ -251,7 +251,8 @@ Hit newHit = new Hit(i, hit.getUniqueKey(), hit.getSortValue(), hit.getDedupValue()); queue.add(newHit); -if (queue.size() > numHits) { // if hit queue overfull +if (queue.size() > query.getParams().getNumHits()) { + // if hit queue overfull queue.remove(); } } @@ -265,6 
+266,15 @@ return new Hits(totalHits, culledResults); } + @Override + @Deprecated + public Hits search(Query query, int numHits, String dedupField, + String sortField, boolean reverse) throws IOException { + +qu
svn commit: r759345 [2/2] - in /lucene/nutch/trunk: site/ src/site/src/documentation/content/xdocs/
Modified: lucene/nutch/trunk/site/tutorial.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial.pdf?rev=759345&r1=759344&r2=759345&view=diff == --- lucene/nutch/trunk/site/tutorial.pdf (original) +++ lucene/nutch/trunk/site/tutorial.pdf Fri Mar 27 20:50:56 2009 @@ -419,8 +419,8 @@ 62 0 obj << /Type /Font /Subtype /Type1 -/Name /F3 -/BaseFont /Helvetica-Bold +/Name /F1 +/BaseFont /Helvetica /Encoding /WinAnsiEncoding >> endobj 63 0 obj @@ -433,15 +433,15 @@ 64 0 obj << /Type /Font /Subtype /Type1 -/Name /F6 -/BaseFont /Times-Italic +/Name /F3 +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding >> endobj 65 0 obj << /Type /Font /Subtype /Type1 -/Name /F1 -/BaseFont /Helvetica +/Name /F2 +/BaseFont /Helvetica-Oblique /Encoding /WinAnsiEncoding >> endobj 66 0 obj @@ -454,8 +454,8 @@ 67 0 obj << /Type /Font /Subtype /Type1 -/Name /F2 -/BaseFont /Helvetica-Oblique +/Name /F6 +/BaseFont /Times-Italic /Encoding /WinAnsiEncoding >> endobj 68 0 obj @@ -479,7 +479,7 @@ endobj 3 0 obj << -/Font << /F3 62 0 R /F5 63 0 R /F1 65 0 R /F6 64 0 R /F9 66 0 R /F2 67 0 R /F7 68 0 R >> +/Font << /F1 62 0 R /F5 63 0 R /F3 64 0 R /F2 65 0 R /F9 66 0 R /F6 67 0 R /F7 68 0 R >> /ProcSet [ /PDF /ImageC /Text ] >> endobj 9 0 obj @@ -618,11 +618,11 @@ 016361 0 n 016588 0 n 016744 0 n -016857 0 n -016967 0 n -017078 0 n -017186 0 n -017292 0 n +016852 0 n +016962 0 n +017075 0 n +017191 0 n +017297 0 n 017408 0 n trailer << Modified: lucene/nutch/trunk/site/tutorial8.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial8.html?rev=759345&r1=759344&r2=759345&view=diff == --- lucene/nutch/trunk/site/tutorial8.html (original) +++ lucene/nutch/trunk/site/tutorial8.html Fri Mar 27 20:50:56 2009 @@ -138,13 +138,16 @@ i18n -API Docs (0.7.2) +API Docs (1.0) + + +API Docs (0.9) API Docs (0.8.x) -API Docs (0.9) +API Docs (0.7.2) http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Modified: 
lucene/nutch/trunk/site/tutorial8.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/tutorial8.pdf?rev=759345&r1=759344&r2=759345&view=diff == --- lucene/nutch/trunk/site/tutorial8.pdf (original) +++ lucene/nutch/trunk/site/tutorial8.pdf Fri Mar 27 20:50:56 2009 @@ -464,8 +464,8 @@ 68 0 obj << /Type /Font /Subtype /Type1 -/Name /F3 -/BaseFont /Helvetica-Bold +/Name /F1 +/BaseFont /Helvetica /Encoding /WinAnsiEncoding >> endobj 69 0 obj @@ -478,15 +478,15 @@ 70 0 obj << /Type /Font /Subtype /Type1 -/Name /F6 -/BaseFont /Times-Italic +/Name /F3 +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding >> endobj 71 0 obj << /Type /Font /Subtype /Type1 -/Name /F1 -/BaseFont /Helvetica +/Name /F2 +/BaseFont /Helvetica-Oblique /Encoding /WinAnsiEncoding >> endobj 72 0 obj @@ -499,8 +499,8 @@ 73 0 obj << /Type /Font /Subtype /Type1 -/Name /F2 -/BaseFont /Helvetica-Oblique +/Name /F6 +/BaseFont /Times-Italic /Encoding /WinAnsiEncoding >> endobj 74 0 obj @@ -524,7 +524,7 @@ endobj 3 0 obj << -/Font << /F3 68 0 R /F5 69 0 R /F1 71 0 R /F6 70 0 R /F9 72 0 R /F2 73 0 R /F7 74 0 R >> +/Font << /F1 68 0 R /F5 69 0 R /F3 70 0 R /F2 71 0 R /F9 72 0 R /F6 73 0 R /F7 74 0 R >> /ProcSet [ /PDF /ImageC /Text ] >> endobj 9 0 obj @@ -669,11 +669,11 @@ 021535 0 n 021762 0 n 021918 0 n -022031 0 n -022141 0 n -022252 0 n -022360 0 n -022466 0 n +022026 0 n +022136 0 n +022249 0 n +022365 0 n +022471 0 n 022582 0 n trailer << Modified: lucene/nutch/trunk/site/version_control.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/version_control.html?rev=759345&r1=759344&r2=759345&view=diff == --- lucene/nutch/trunk/site/version_control.html (original) +++ lucene/nutch/trunk/site/version_control.html Fri Mar 27 20:50:56 2009 @@ -138,13 +138,16 @@ i18n -API Docs (0.7.2) +API Docs (1.0) + + +API Docs (0.9) API Docs (0.8.x) -API Docs (0.9) +API Docs (0.7.2) http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) 
Modified: lucene/nutch/trunk/site/version_control.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/version_control.pdf?rev=759345&r1=759344&r2=759345&view=diff == --- lucene/nutch/trunk/site/version_control.pdf (original) +++ lucene/nutch/trunk/
svn commit: r759328 - /lucene/nutch/tags/release-1.0/
Author: siren Date: Fri Mar 27 20:10:11 2009 New Revision: 759328 URL: http://svn.apache.org/viewvc?rev=759328&view=rev Log: Nutch 1.0 release. Added: lucene/nutch/tags/release-1.0/ - copied from r759327, lucene/nutch/tags/release-1.0-rc2/
svn commit: r757511 - /lucene/nutch/tags/release-1.0-rc2/
Author: siren Date: Mon Mar 23 19:18:47 2009 New Revision: 757511 URL: http://svn.apache.org/viewvc?rev=757511&view=rev Log: Nutch 1.0 rc2 Added: lucene/nutch/tags/release-1.0-rc2/ - copied from r757510, lucene/nutch/trunk/
svn commit: r757500 - /lucene/nutch/trunk/CHANGES.txt
Author: siren Date: Mon Mar 23 18:59:26 2009 New Revision: 757500 URL: http://svn.apache.org/viewvc?rev=757500&view=rev Log: update release date Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=757500&r1=757499&r2=757500&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 23 18:59:26 2009 @@ -1,6 +1,6 @@ Nutch Change Log -Release 1.0 - 2009-03-10 +Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
svn commit: r757327 - in /lucene/nutch/trunk: CHANGES.txt README.txt src/plugin/parse-pdf/lib/jai_codec.jar src/plugin/parse-pdf/lib/jai_core.jar src/plugin/parse-pdf/plugin.xml
Author: siren Date: Mon Mar 23 06:41:13 2009 New Revision: 757327 URL: http://svn.apache.org/viewvc?rev=757327&view=rev Log: NUTCH-722 remove JAI libs Removed: lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_codec.jar lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_core.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/README.txt lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=757327&r1=757326&r2=757327&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 23 06:41:13 2009 @@ -380,6 +380,8 @@ 143. NUTCH-715 - Subcollection plugin doesn't work with default subcollections.xml file (Dmitry Lihachev via siren) + +144. NUTCH-722 - Nutch contains JAI jars that we cannot redistribute Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/README.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/README.txt?rev=757327&r1=757326&r2=757327&view=diff == --- lucene/nutch/trunk/README.txt (original) +++ lucene/nutch/trunk/README.txt Mon Mar 23 06:41:13 2009 @@ -1,5 +1,19 @@ Apache Nutch README +Important note: Due to licensing issues we cannot provide two libraries that +are normally provided with PDFBox (jai_core.jar, jai_codec.jar), the parser +library we use for parsing PDF files. If you encounter unexpected problems when +working with PDF files please + +1. download the two missing libraries from: + http://pdfbox.cvs.sourceforge.net/viewvc/pdfbox/pdfbox/external/ + +2. Put them to directory src/plugin/parse-pdf/lib +3. follow the instructions in file src/plugin/parse-pdf/plugin.xml +4. Rebuild nutch. 
+ + + Interesting files include: Modified: lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml?rev=757327&r1=757326&r2=757327&view=diff == --- lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Mon Mar 23 06:41:13 2009 @@ -30,8 +30,12 @@ + +
svn commit: r756218 - /lucene/nutch/trunk/build.xml
Author: siren Date: Thu Mar 19 21:34:47 2009 New Revision: 756218 URL: http://svn.apache.org/viewvc?rev=756218&view=rev Log: NUTCH-727 Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=756218&r1=756217&r2=756218&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Thu Mar 19 21:34:47 2009 @@ -575,6 +575,7 @@ +
svn commit: r756210 - /lucene/nutch/trunk/KEYS
Author: siren Date: Thu Mar 19 21:26:52 2009 New Revision: 756210 URL: http://svn.apache.org/viewvc?rev=756210&view=rev Log: copy keys to trunk Added: lucene/nutch/trunk/KEYS - copied unchanged from r756209, lucene/nutch/dist/KEYS
svn commit: r756199 - /lucene/nutch/trunk/NOTICE.txt
Author: siren Date: Thu Mar 19 21:10:28 2009 New Revision: 756199 URL: http://svn.apache.org/viewvc?rev=756199&view=rev Log: NUTCH-725 Modified: lucene/nutch/trunk/NOTICE.txt Modified: lucene/nutch/trunk/NOTICE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/NOTICE.txt?rev=756199&r1=756198&r2=756199&view=diff == --- lucene/nutch/trunk/NOTICE.txt (original) +++ lucene/nutch/trunk/NOTICE.txt Thu Mar 19 21:10:28 2009 @@ -71,4 +71,5 @@ Nutch includes Automaton: This package is Copyright © 2001-2008 Anders Møller. All rights reserved. - +Nutch includes Rome: +Copyright 2004 Sun Microsystems, Inc.
svn commit: r756198 [1/2] - /lucene/nutch/trunk/LICENSE.txt
Author: siren Date: Thu Mar 19 21:09:56 2009 New Revision: 756198 URL: http://svn.apache.org/viewvc?rev=756198&view=rev Log: NUTCH-723 Modified: lucene/nutch/trunk/LICENSE.txt
svn commit: r756192 - in /lucene/nutch/trunk/src/plugin/response-json/lib: ezmorph-1.0.6.LICENSE.txt json-lib-2.2.2-jdk15.LICENSE.txt
Author: siren Date: Thu Mar 19 20:53:42 2009 New Revision: 756192 URL: http://svn.apache.org/viewvc?rev=756192&view=rev Log: record licenses Added: lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt lucene/nutch/trunk/src/plugin/response-json/lib/json-lib-2.2.2-jdk15.LICENSE.txt Added: lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt?rev=756192&view=auto == --- lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt (added) +++ lucene/nutch/trunk/src/plugin/response-json/lib/ezmorph-1.0.6.LICENSE.txt Thu Mar 19 20:53:42 2009 @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 +http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the term
svn commit: r756182 - /lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt
Author: siren Date: Thu Mar 19 20:41:19 2009 New Revision: 756182 URL: http://svn.apache.org/viewvc?rev=756182&view=rev Log: record license Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt?rev=756182&view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.COPYING.txt Thu Mar 19 20:41:19 2009 @@ -0,0 +1,24 @@ +Copyright (c) 2001-2004 Anders Moeller +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file
svn commit: r756181 - /lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt
Author: siren Date: Thu Mar 19 20:40:19 2009 New Revision: 756181 URL: http://svn.apache.org/viewvc?rev=756181&view=rev Log: record license Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt?rev=756181&view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.LICENCE.txt Thu Mar 19 20:40:19 2009 @@ -0,0 +1,17 @@ +dk.brics.automaton +-- + +Copyright (C) 2001-2004 Anders Moeller + +This source code in this package may be used under the terms of the +BSD license. Please read the file 'COPYING' for details. + +This package contains a full DFA/NFA implementation with Unicode +alphabet and support for all standard regular expression operations. + +For more information, go to the package home page at +http://www.brics.dk/~amoeller/automaton/ + + +Anders Moeller +amoel...@brics.dk \ No newline at end of file
svn commit: r756174 - /lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt
Author: siren Date: Thu Mar 19 20:32:15 2009 New Revision: 756174 URL: http://svn.apache.org/viewvc?rev=756174&view=rev Log: record license Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt?rev=756174&view=auto == --- lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt (added) +++ lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.LICENSE.txt Thu Mar 19 20:32:15 2009 @@ -0,0 +1,14 @@ +Copyright 2004 Sun Microsystems, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +
svn commit: r756171 - /lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt
Author: siren Date: Thu Mar 19 20:27:58 2009 New Revision: 756171 URL: http://svn.apache.org/viewvc?rev=756171&view=rev Log: record license Added: lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt Added: lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt?rev=756171&view=auto == --- lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt (added) +++ lucene/nutch/trunk/lib/jets3t-0.6.1.LICENSE.txt Thu Mar 19 20:27:58 2009 @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 +http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) pa
svn commit: r756154 - /lucene/nutch/trunk/NOTICE.txt
Author: siren Date: Thu Mar 19 19:48:03 2009 New Revision: 756154 URL: http://svn.apache.org/viewvc?rev=756154&view=rev Log: NUTCH-725 Modified: lucene/nutch/trunk/NOTICE.txt Modified: lucene/nutch/trunk/NOTICE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/NOTICE.txt?rev=756154&r1=756153&r2=756154&view=diff == --- lucene/nutch/trunk/NOTICE.txt (original) +++ lucene/nutch/trunk/NOTICE.txt Thu Mar 19 19:48:03 2009 @@ -1,2 +1,74 @@ +Apache Nutch +Copyright 2009 The Apache Software Foundation + This product includes software developed by The Apache Software Foundation (http://www.apache.org/). + +This product includes software developed by the following copyright owners: + +Nutch includes icu4j: +Copyright (c) 1995-2006 International Business Machines Corporation and +others + +Nutch includes Carrot2: +Copyright (C) 2002-2006, Dawid Weiss, Stanisław Osiński. +Dawid Weiss; Project administrator, various components, core; 2002; Poland +Stanisław, Osiński; Lingo clustering component, ODP Input; 2003; Poland +Karol Gołembniak, Irmina Masłowska; HAOG clustering component; 2006; Poznan University of Technology; Poland +Michał, Wróblewski [*]; AHC clustering components; 2003; Poznan University of Technology, Poland +Paweł, Kowalik [*]; Inductive search engine wrapper; 2003; Poznan University of Technology, Poland +Steven, Schockaert [*]; Fuzzy Ants clustering component; 2004; University of Gent, Belgium +Lang, Ngo Chi [*]; Fuzzy Rough set clustering component; 2004; Warsaw University, Poland + +Nutch includes Saxpath: +Copyright (C) 2000-2002 werken digital. All rights reserved. + +Nutch includes jaxen: +Copyright 2003-2006 The Werken Company. All Rights Reserved. + +Nutch includes Jdom: +Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. +All rights reserved + +Nutch includes SaxPath: +Copyright (C) 2000-2002 werken digital. All rights reserved. 
+ +Nutch includes Snowball: +Copyright (c) 2001, Dr Martin Porter +(for the Java developments) Copyright (c) 2002, Richard Boulton. + +Nutch includes ViolinStrings: +Copyright (c) Michael Schmeling 1998, 2000 - All Rights Reserved + +Nutch includes CyberNeko: +(C) Copyright 2002,2003, Andy Clark. All rights reserved. + +Nutch includes Jena: +(c) Copyright 2000, 2001, 2002, 2003, 2004 Hewlett-Packard Development Company, LP +All rights reserved. + +Nutch includes BouncyCastle: +Copyright (c) 2000 - 2008 The Legion Of The Bouncy Castle (http://www.bouncycastle.org) + +Nutch includes FontBox: +Copyright (c) 2003-2005, www.fontbox.org + +Nutch includes JempBox: +Copyright (c) 2006-2007, www.jempbox.org +All rights reserved. + +Nutch includes PDFBox: +Copyright (c) 2003-2005, www.pdfbox.org +All rights reserved. + +Nutch includes JavaSWF: +Copyright (c) 2001-2005, David N. Main, All rights reserved. + +Nutch includes Json Lib: +This product includes software developed by Douglas Crockford +(http://www.crockford.com). + +Nutch includes Automaton: +This package is Copyright © 2001-2008 Anders Møller. All rights reserved. + +
svn commit: r756149 - in /lucene/nutch/trunk/src/plugin/lib-xml/lib: jaxen.LICENSE jdom.LICENSE saxpath.LICENSE
Author: siren Date: Thu Mar 19 19:33:11 2009 New Revision: 756149 URL: http://svn.apache.org/viewvc?rev=756149&view=rev Log: record licenses Added: lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE lucene/nutch/trunk/src/plugin/lib-xml/lib/saxpath.LICENSE Added: lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE?rev=756149&view=auto == --- lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE (added) +++ lucene/nutch/trunk/src/plugin/lib-xml/lib/jaxen.LICENSE Thu Mar 19 19:33:11 2009 @@ -0,0 +1,33 @@ +/* + $Id: LICENSE.txt 1128 2006-02-05 21:49:04Z elharo $ + + Copyright 2003-2006 The Werken Company. All Rights Reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + + * Neither the name of the Jaxen Project nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + */ Added: lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE?rev=756149&view=auto == --- lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE (added) +++ lucene/nutch/trunk/src/plugin/lib-xml/lib/jdom.LICENSE Thu Mar 19 19:33:11 2009 @@ -0,0 +1,55 @@ +/*-- + + $Id: LICENSE.txt,v 1.11 2004/02/06 09:32:57 jhunter Exp $ + + Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright +notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions, and the disclaimer that follows +these conditions in the documentation and/or other materials +provided with the distribution. + + 3. The name "JDOM" must not be used to endorse or promote products +derived from this software without prior written permission. For +written permission, please contact . + + 4. Products derived from this software may not be called "JDOM", nor +may "JDOM" appear in their name, without prior written permission +from the JDOM Project Management . 
+ + In addition, we request (but do not require) that you include in the + end-user documentation provided with the redistribution and/or in the + software itself an acknowledgement equivalent to the following: + "This product includes software developed by the + JDOM Project (http://www.jdom.org/)." + Alternatively, the acknowledgment may be graphical using the logos + available at http://www.jdom.org/images/logos. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE
svn commit: r755994 - /lucene/nutch/trunk/README.txt
Author: siren Date: Thu Mar 19 13:47:32 2009 New Revision: 755994 URL: http://svn.apache.org/viewvc?rev=755994&view=rev Log: NUTCH-726 Modified: lucene/nutch/trunk/README.txt Modified: lucene/nutch/trunk/README.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/README.txt?rev=755994&r1=755993&r2=755994&view=diff == --- lucene/nutch/trunk/README.txt (original) +++ lucene/nutch/trunk/README.txt Thu Mar 19 13:47:32 2009 @@ -1,4 +1,4 @@ -Nutch README +Apache Nutch README Interesting files include:
svn commit: r752004 - /lucene/nutch/tags/release-1.0-rc1/
Author: siren Date: Tue Mar 10 07:15:13 2009 New Revision: 752004 URL: http://svn.apache.org/viewvc?rev=752004&view=rev Log: Nutch 1.0 rc1 Added: lucene/nutch/tags/release-1.0-rc1/ - copied from r752003, lucene/nutch/trunk/
svn commit: r752001 - /lucene/nutch/trunk/CHANGES.txt
Author: siren Date: Tue Mar 10 07:08:29 2009 New Revision: 752001 URL: http://svn.apache.org/viewvc?rev=752001&view=rev Log: prepare for release Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=752001&r1=752000&r2=752001&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 10 07:08:29 2009 @@ -1,6 +1,6 @@ Nutch Change Log -Release 1.0 - 2009-03-08 +Release 1.0 - 2009-03-10 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
svn commit: r752000 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/DomUtil.java src/plugin/build.xml src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollecti
Author: siren Date: Tue Mar 10 07:07:22 2009 New Revision: 752000 URL: http://svn.apache.org/viewvc?rev=752000&view=rev Log: NUTCH-715 - Subcollection plugin doesn't work with default subcollections.xml file. Contributed by Dmitry Lihachev Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=752000&r1=751999&r2=752000&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 10 07:07:22 2009 @@ -378,6 +378,9 @@ 142. NUTCH-684 - Dedup support for Solr. (dogacan) +143. NUTCH-715 - Subcollection plugin doesn't work with default + subcollections.xml file (Dmitry Lihachev via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java?rev=752000&r1=751999&r2=752000&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java Tue Mar 10 07:07:22 2009 @@ -60,7 +60,11 @@ input = new InputSource(is); input.setEncoding("UTF-8"); parser.parse(input); - element = (Element) parser.getDocument().getChildNodes().item(0); + int i = 0; + while (! 
(parser.getDocument().getChildNodes().item(i) instanceof Element)) { + i++; + } + element = (Element)parser.getDocument().getChildNodes().item(i); } catch (FileNotFoundException e) { e.printStackTrace(LogUtil.getWarnStream(LOG)); } catch (SAXException e) { Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=752000&r1=751999&r2=752000&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Mar 10 07:07:22 2009 @@ -112,6 +112,7 @@ + Modified: lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=752000&r1=751999&r2=752000&view=diff == --- lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original) +++ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Tue Mar 10 07:07:22 2009 @@ -49,6 +49,7 @@ public void testInput(){ StringBuffer xml=new StringBuffer(); xml.append(""); +xml.append(""); xml.append(""); xml.append(""); xml.append("nutch collection");
svn commit: r751480 - /lucene/nutch/tags/release-1.0-rc0/
Author: siren Date: Sun Mar 8 17:38:39 2009 New Revision: 751480 URL: http://svn.apache.org/viewvc?rev=751480&view=rev Log: Nutch 1.0 rc0 Added: lucene/nutch/tags/release-1.0-rc0/ - copied from r751479, lucene/nutch/trunk/
svn commit: r751475 - /lucene/nutch/trunk/CHANGES.txt
Author: siren Date: Sun Mar 8 17:30:52 2009 New Revision: 751475 URL: http://svn.apache.org/viewvc?rev=751475&view=rev Log: the version is indeed 1.0 Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=751475&r1=751474&r2=751475&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 8 17:30:52 2009 @@ -1,6 +1,6 @@ Nutch Change Log -Release 0.9 - 2009-03-08 +Release 1.0 - 2009-03-08 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
svn commit: r751471 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml default.properties
Author: siren Date: Sun Mar 8 17:20:59 2009 New Revision: 751471 URL: http://svn.apache.org/viewvc?rev=751471&view=rev Log: preparing for release Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/default.properties Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=751471&r1=751470&r2=751471&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 8 17:20:59 2009 @@ -1,6 +1,6 @@ Nutch Change Log -Unreleased changes (1.0-dev) +Release 0.9 - 2009-03-08 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=751471&r1=751470&r2=751471&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sun Mar 8 17:20:59 2009 @@ -113,7 +113,7 @@ http.agent.version - Nutch-1.0-dev + Nutch-1.0 A version string to advertise in the User-Agent header. Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/default.properties?rev=751471&r1=751470&r2=751471&view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Sun Mar 8 17:20:59 2009 @@ -1,6 +1,6 @@ Name=Nutch name=nutch -version=1.0-dev +version=1.0 final.name=${name}-${version} year=2006
svn commit: r749289 - in /lucene/nutch/trunk: ./ bin/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/test/org/apache/nutch/fetcher/
Author: siren Date: Mon Mar 2 12:28:22 2009 New Revision: 749289 URL: http://svn.apache.org/viewvc?rev=749289&view=rev Log: NUTCH-669 - Consolidate code for Fetcher and Fetcher2 Added: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java - copied, changed from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Removed: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749289&r1=749288&r2=749289&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 2 12:28:22 2009 @@ -372,6 +372,8 @@ 139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren) +140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?rev=749289&r1=749288&r2=749289&view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Mon Mar 2 12:28:22 2009 @@ -41,7 +41,6 @@ echo " generate generate new segments to fetch from crawl db" echo " freegen generate new segments to fetch from text files" echo " fetch fetch a segment's pages" - echo " fetch2fetch a segment's pages using Fetcher2 implementation" echo " parse parse a segment's pages" echo " readseg read / dump segment data" echo " mergesegs merge several segments, with optional filtering and slicing" Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=749289&r1=749288&r2=749289&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java 
(original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 2 12:28:22 2009 @@ -24,7 +24,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.nutch.fetcher.Fetcher; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; @@ -36,6 +35,8 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; +import org.apache.nutch.fetcher.Fetcher; + public class Crawl { public static final Log LOG = LogFactory.getLog(Crawl.class); @@ -118,7 +119,7 @@ LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; } - fetcher.fetch(segment, threads); // fetch it + fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segment);// parse it, if needed } Copied: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java) URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?p2=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&p1=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java&r1=747319&r2=749289&rev=749289&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 2 12:28:22 2009 @@ -1,9 +1,10 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE
svn commit: r749256 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar src/plugin/lib-nekohtml/lib/nekohtml-1.9.11.jar src/plugin/lib-nekohtml/plugin.xml
Author: siren Date: Mon Mar 2 10:16:51 2009 New Revision: 749256 URL: http://svn.apache.org/viewvc?rev=749256&view=rev Log: NUTCH-700 - revert to nekohtml-0.9.4 Added: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar (with props) Removed: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-1.9.11.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749256&r1=749255&r2=749256&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 2 10:16:51 2009 @@ -369,6 +369,8 @@ 138. NUTCH-419 - Unavailable robots.txt kills fetch (Carsten Lehmann, Doug Cook via ab) + +139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren) Release 0.9 - 2007-04-02 Added: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar?rev=749256&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml?rev=749256&r1=749255&r2=749256&view=diff == --- lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml Mon Mar 2 10:16:51 2009 @@ -29,7 +29,7 @@ provider-name="org.cyberneko"> - +
svn commit: r748408 - in /lucene/nutch/trunk: CHANGES.txt conf/schema.xml
Author: siren Date: Fri Feb 27 06:21:37 2009 New Revision: 748408 URL: http://svn.apache.org/viewvc?rev=748408&view=rev Log: NUTCH-699 - Add an "official" solr schema for solr integration. Contributed by dogacan, Dmitry Lihachev Added: lucene/nutch/trunk/conf/schema.xml Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=748408&r1=748407&r2=748408&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 27 06:21:37 2009 @@ -361,6 +361,9 @@ 135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan via siren) + +136. NUTCH-699 - Add an "official" solr schema for solr integration (dogacan, + Dmitry Lihachev via siren) Release 0.9 - 2007-04-02 Added: lucene/nutch/trunk/conf/schema.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/schema.xml?rev=748408&view=auto == --- lucene/nutch/trunk/conf/schema.xml (added) +++ lucene/nutch/trunk/conf/schema.xml Fri Feb 27 06:21:37 2009 @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +id +content + + + \ No newline at end of file
svn commit: r747324 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: siren Date: Tue Feb 24 10:09:36 2009 New Revision: 747324 URL: http://svn.apache.org/viewvc?rev=747324&view=rev Log: NUTCH-698 - CrawlDb is corrupted after a few crawl cycles, contributed by dogacan Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747324&r1=747323&r2=747324&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 10:09:36 2009 @@ -359,6 +359,9 @@ 134. NUTCH-247 - Robot parser to restrict (kubes, siren) +135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan + via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=747324&r1=747323&r2=747324&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Feb 24 10:09:36 2009 @@ -204,7 +204,17 @@ } public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) { - this.metaData = mapWritable; + this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable); + } + + /** Add all metadata from other CrawlDatum to this CrawlDatum. 
+* +* @param other CrawlDatum +*/ + public void putAllMetaData(CrawlDatum other) { + for (Entry e : other.getMetaData().entrySet()) { + metaData.put(e.getKey(), e.getValue()); + } } /** Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=747324&r1=747323&r2=747324&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Feb 24 10:09:36 2009 @@ -131,10 +131,10 @@ if (oldSet) { // copy metadata from old, if exists if (old.getMetaData().size() > 0) { -result.getMetaData().putAll(old.getMetaData()); +result.putAllMetaData(old); // overlay with new, if any if (fetch.getMetaData().size() > 0) - result.getMetaData().putAll(fetch.getMetaData()); + result.putAllMetaData(fetch); } // set the most recent valid value of modifiedTime if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
svn commit: r747319 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/test/ src/test/org/apache/nutch/fetcher/
Author: siren Date: Tue Feb 24 09:54:30 2009 New Revision: 747319 URL: http://svn.apache.org/viewvc?rev=747319&view=rev Log: NUTCH-247 - Robot parser to restrict, contributed by kubes Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java lucene/nutch/trunk/src/test/crawl-tests.xml lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747319&r1=747318&r2=747319&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:54:30 2009 @@ -357,6 +357,8 @@ 133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links set at cross domain redirects (Remco Verhoef, dogacan via siren) +134. NUTCH-247 - Robot parser to restrict (kubes, siren) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747319&r1=747318&r2=747319&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:54:30 2009 @@ -933,6 +933,8 @@ public void fetch(Path segment, int threads, boolean parsing) throws IOException { +checkConfiguration(); + if (LOG.isInfoEnabled()) { LOG.info("Fetcher: starting"); LOG.info("Fetcher: segment: " + segment); @@ -995,4 +997,40 @@ fetcher.fetch(segment, threads, parsing); // run the Fetcher } + + private void checkConfiguration() { + +// ensure that a value has been set for the agent name and that that +// agent name is the first value in the agents we advertise for robot +// rules parsing +String agentName = getConf().get("http.agent.name"); +if (agentName == null || agentName.trim().length() == 0) { + String message = "Fetcher: No agents listed in 'http.agent.name'" + + " property."; + if (LOG.isFatalEnabled()) { +LOG.fatal(message); + } + throw new IllegalArgumentException(message); +} else { + + // get all of the agents that we advertise + String agentNames = getConf().get("http.robots.agents"); + StringTokenizer tok = new StringTokenizer(agentNames, ","); + ArrayList agents = new ArrayList(); + while (tok.hasMoreTokens()) { +agents.add(tok.nextToken().trim()); + } + + // if the first one is not equal to our agent name, log fatal and throw + // an exception + if (!(agents.get(0)).equalsIgnoreCase(agentName)) { +String message = "Fetcher: Your 'http.agent.name' value should be " ++ "listed first in 'http.robots.agents' property."; +if (LOG.isWarnEnabled()) { + LOG.warn(message); +} + } +} + } + } Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=747319&r1=747318&r2=747319&view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Feb 24 09:54:30 2009 @@ -223,9 +223,6 @@ // Grab the agent names we advertise to robots files. // String agentName = conf.get("http.agent.name"); -if (null == agentName) { - throw new RuntimeException("Agent name not configured!"); -} String agentNames = conf.get("http.robots.agents"); StringTokenizer tok = new StringTokenizer(agentNames, ","); ArrayList agents = new ArrayList(); @@ -233,23 +230,6 @@ agents.add(tok.nextToken().trim()); } -// -// If there are no agents for robots-parsing, use our -// default agent-string. If both are present, our agent-string -// should be the first one we advertise to robots-parsing. -// -if (agents.size() == 0) { - agents.add(agentName); - if (LOG.isFatalEnabled()) { -LOG.fata
svn commit: r747312 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher2.java
Author: siren Date: Tue Feb 24 09:18:03 2009 New Revision: 747312 URL: http://svn.apache.org/viewvc?rev=747312&view=rev Log: NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links set at cross domain redirects, contributed by Remco Verhoef, dogacan Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747312&r1=747311&r2=747312&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:18:03 2009 @@ -349,11 +349,14 @@ 130. NUTCH-563 - Include custom fields in BasicQueryFilter (Julien Nioche via siren) -131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin +131. NUTCH-695 - Incorrect mime type detection by MoreIndexingFilter plugin (Dmitry Lihachev via siren) 132. NUTCH-694 - Distributed Search Server fails (siren) +133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links + set at cross domain redirects (Remco Verhoef, dogacan via siren) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747312&r1=747311&r2=747312&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:18:03 2009 @@ -94,7 +94,6 @@ throws IOException { FileStatus[] files = listStatus(job); FileSplit[] splits = new FileSplit[files.length]; - FileSystem fs = FileSystem.get(job); for (int i = 0; i < files.length; i++) { FileStatus cur = files[i]; splits[i] = new FileSplit(cur.getPath(), 0, @@ -443,6 +442,7 @@ private String reprUrl; private boolean redirecting; private int redirectCount; +private boolean ignoreExternalLinks; public FetcherThread(Configuration conf) { this.setDaemon(true); // don't hang JVM on exit @@ -457,6 +457,8 @@ // backward-compatible default setting this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); this.maxRedirect = conf.getInt("http.redirect.max", 3); + this.ignoreExternalLinks = +conf.getBoolean("db.ignore.external.links", false); } public void run() { @@ -673,6 +675,22 @@ throws MalformedURLException, URLFilterException { newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); + + if (ignoreExternalLinks) { +try { + String origHost = new URL(urlString).getHost().toLowerCase(); + String newHost = new URL(newUrl).getHost().toLowerCase(); + if (!origHost.equals(newHost)) { +if (LOG.isDebugEnabled()) { + LOG.debug(" - ignoring redirect " + redirType + " from " + + urlString + " to " + newUrl + + " because external links are ignored"); +} +return null; + } +} catch (MalformedURLException e) { } + } + if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl);
svn commit: r746900 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/NutchBean.java
Author: siren Date: Mon Feb 23 07:02:30 2009 New Revision: 746900 URL: http://svn.apache.org/viewvc?rev=746900&view=rev Log: NUTCH-694 - Distributed Search Server fails Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=746900&r1=746899&r2=746900&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Feb 23 07:02:30 2009 @@ -351,6 +351,8 @@ 131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin (Dmitry Lihachev via siren) + +132. NUTCH-694 - Distributed Search Server fails (siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=746900&r1=746899&r2=746900&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Mon Feb 23 07:02:30 2009 @@ -48,13 +48,10 @@ //LogFormatter.setShowThreadIDs(true); // } - private String[] segmentNames; - private SearchBean searchBean; private SegmentBean segmentBean; private final HitInlinks linkDb; - /** BooleanQuery won't permit more than 32 required/prohibited clauses. We * don't want to use too many of those. 
*/ private static final int MAX_PROHIBITED_TERMS = 20; @@ -149,8 +146,8 @@ } } - public String[] getSegmentNames() { -return segmentNames; + public String[] getSegmentNames() throws IOException { +return segmentBean.getSegmentNames(); } public Hits search(Query query, int numHits) throws IOException { @@ -374,17 +371,23 @@ final Configuration conf = NutchConfiguration.create(); final NutchBean bean = new NutchBean(conf); -final Query query = Query.parse(args[0], conf); -final Hits hits = bean.search(query, 10); -System.out.println("Total hits: " + hits.getTotal()); -final int length = (int)Math.min(hits.getTotal(), 10); -final Hit[] show = hits.getHits(0, length); -final HitDetails[] details = bean.getDetails(show); -final Summary[] summaries = bean.getSummary(details, query); +try { + final Query query = Query.parse(args[0], conf); + final Hits hits = bean.search(query, 10); + System.out.println("Total hits: " + hits.getTotal()); + final int length = (int)Math.min(hits.getTotal(), 10); + final Hit[] show = hits.getHits(0, length); + final HitDetails[] details = bean.getDetails(show); + final Summary[] summaries = bean.getSummary(details, query); -for (int i = 0; i < hits.getLength(); i++) { - System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); + for (int i = 0; i < hits.getLength(); i++) { +System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); + } +} catch (Throwable t) { + LOG.error("Exception occured while executing search: " + t, t); + System.exit(1); } +System.exit(0); } public long getProtocolVersion(String className, long clientVersion) @@ -394,7 +397,7 @@ final RPCSearchBean rpcBean = (RPCSearchBean)searchBean; return rpcBean.getProtocolVersion(className, clientVersion); -} else if (SegmentBean.class.getName().equals(className) && +} else if (RPCSegmentBean.class.getName().equals(className) && segmentBean instanceof RPCSegmentBean) { final RPCSegmentBean rpcBean = (RPCSegmentBean)segmentBean;
svn commit: r745808 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/m
Author: siren Date: Thu Feb 19 10:25:47 2009 New Revision: 745808 URL: http://svn.apache.org/viewvc?rev=745808&view=rev Log: NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin, contributed by Dmitry Lihachev Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745808&r1=745807&r2=745808&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Feb 19 10:25:47 2009 @@ -348,6 +348,9 @@ 130. NUTCH-563 - Include custom fields in BasicQueryFilter (Julien Nioche via siren) + +131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin + (Dmitry Lihachev via siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff == --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Feb 19 10:25:47 2009 @@ -199,20 +199,20 @@ MimeType mimeType = null; String contentType = data.getMeta(Response.CONTENT_TYPE); if (contentType == null) { -// Note by Jerome Charron on 20050415: -// Content Type not solved by a previous plugin -// Or unable to solve it... Trying to find it -// Should be better to use the doc content too -// (using MimeTypes.getMimeType(byte[], String), but I don't know -// which field it is? 
-// if (MAGIC) { -// contentType = MIME.getMimeType(url, content); -// } else { -// contentType = MIME.getMimeType(url); -// } -mimeType = MIME.getMimeType(url); + // Note by Jerome Charron on 20050415: + // Content Type not solved by a previous plugin + // Or unable to solve it... Trying to find it + // Should be better to use the doc content too + // (using MimeTypes.getMimeType(byte[], String), but I don't know + // which field it is? + // if (MAGIC) { + // contentType = MIME.getMimeType(url, content); + // } else { + // contentType = MIME.getMimeType(url); + // } + mimeType = MIME.getMimeType(url); } else { -mimeType = MIME.forName(contentType); + mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); } // Checks if we solved the content-type. Modified: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff == --- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Thu Feb 19 10:25:47 2009 @@ -16,10 +16,30 @@ */ package org.apache.nutch.indexer.more; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + import 
junit.framework.TestCase; public class TestMoreIndexingFilter extends TestCase { + public void testContentType() throws IndexingException { +Configuration conf = NutchConfiguration.create(); +assertContentType(conf, "text/html", "text/html"); +assertContentType(conf, "text/html; charset=UTF-8", "text/html"); + } + public void testGetParts() { String[] parts = MoreIndexingFilter.getParts("text/html"); assertParts(parts, 2, "text&
svn commit: r745517 - /lucene/nutch/trunk/contrib/web2/
Author: siren Date: Wed Feb 18 14:03:18 2009 New Revision: 745517 URL: http://svn.apache.org/viewvc?rev=745517&view=rev Log: remove web2 as agreed on nutch-dev Removed: lucene/nutch/trunk/contrib/web2/
svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
Author: siren Date: Wed Feb 18 12:53:12 2009 New Revision: 745503 URL: http://svn.apache.org/viewvc?rev=745503&view=rev Log: NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien Nioche Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009 @@ -346,6 +346,9 @@ 129. NUTCH-691 - Update jakarta poi jars to the most relevant version (Dmitry Lihachev via siren) +130. NUTCH-563 - Include custom fields in BasicQueryFilter + (Julien Nioche via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009 @@ -1119,6 +1119,15 @@ + + Modified: lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original) +++ lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Feb 18 12:53:12 2009 @@ -22,6 +22,13 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; +import java.util.ArrayList; +import java.util.Iterator; +import 
java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.apache.nutch.analysis.NutchDocumentAnalyzer; import org.apache.nutch.analysis.CommonGrams; @@ -31,7 +38,12 @@ import org.apache.hadoop.conf.Configuration; /** The default query filter. Query terms in the default query field are - * expanded to search the url, anchor and content document fields.*/ + * expanded to search the url, anchor and content document fields. + * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost + * to the configuration files (see nutch-default.xml for an example).Such fields will be used in the clauses + * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B). + * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it. + **/ public class BasicQueryFilter implements QueryFilter { private static final int URL_BOOST = 0; @@ -44,7 +56,7 @@ private float PHRASE_BOOST; - private static final String[] FIELDS = + private String[] FIELDS = { "url", "anchor", "content", "title", "host" }; private float[] FIELD_BOOSTS = new float[5]; @@ -177,9 +189,51 @@ this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f); this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f); this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f); +findAdditionalFields(conf); } public Configuration getConf() { return this.conf; } + + /** Searches for parameters of the form : query.basic.(fieldname).boost + * and adds the fielname to the list of default fields. 
+ **/ + private void findAdditionalFields(Configuration conf) { +// get additional fields specified in parameters +Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost"); +Iterator confEntriesIterator = conf.iterator(); +List existingFields = java.util.Arrays.asList(FIELDS); +ArrayList tempfieldNames = new ArrayList(); +ArrayList tempfieldBoosts = new ArrayList(); +while (confEntriesIterator.hasNext()){ + Map.Entry entry = (Map.Entry) confEntriesIterator.next(); + String key = entry.getKey().toString(); + Matcher match = pat.matcher(key); + if (!match.matches())continue; + String fieldName = match.group(1); + if (fieldName!=null){ +// check whether it matches one of t
svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/ src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ sr
Author: siren Date: Wed Feb 18 12:43:04 2009 New Revision: 745499 URL: http://svn.apache.org/viewvc?rev=745499&view=rev Log: NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed by Dmitry Lihachev Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar (with props) lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar (with props) Removed: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009 @@ -343,6 +343,9 @@ 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException (Stefan Will, siren) +129. NUTCH-691 - Update jakarta poi jars to the most relevant version + (Dmitry Lihachev via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 12:43:04 2009 @@ -29,10 +29,10 @@ provider-name="jakarta.apache.org"> - + - + Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 2009 @@ -44,7 +44,8 @@ - - + + + Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java Wed Feb 18 12:43:04 2009 @@ -53,8 +53,9 @@ int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character 
properties + Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, - chpTableSize, fcMi
svn commit: r745448 - /lucene/nutch/trunk/build.xml
Author: siren Date: Wed Feb 18 09:18:07 2009 New Revision: 745448 URL: http://svn.apache.org/viewvc?rev=745448&view=rev Log: NUTCH-687 add RAT, also check plugins Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745448&r1=745447&r2=745448&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 18 09:18:07 2009 @@ -624,7 +624,9 @@ - + + +
svn commit: r745446 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/ plugin/field-boost/src/java/org/apache/nutch/indexer/fie
Author: siren Date: Wed Feb 18 09:14:29 2009 New Revision: 745446 URL: http://svn.apache.org/viewvc?rev=745446&view=rev Log: NUTCH-688 add missing headers, part 2 rest Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java lucene/nutch/trunk/src/plugin/response-json/src/java/org/apache/nutch/searcher/response/json/JSONResponseWriter.java lucene/nutch/trunk/src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java lucene/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.util; import java.io.DataInput; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.nutch.util; import java.util.Stack; Modified: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (original) +++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the A
svn commit: r745416 - /lucene/nutch/trunk/build.xml
Author: siren Date: Wed Feb 18 08:11:46 2009 New Revision: 745416 URL: http://svn.apache.org/viewvc?rev=745416&view=rev Log: NUTCH-687 add RAT Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745416&r1=745415&r2=745416&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 18 08:11:46 2009 @@ -610,4 +610,23 @@ + + + + + + + + + + + + + + + + + +
svn commit: r745096 - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/index-more/src/test/ src/plugin/index-more/src/test/org/ src/plugi
Author: siren Date: Tue Feb 17 14:28:14 2009 New Revision: 745096 URL: http://svn.apache.org/viewvc?rev=745096&view=rev Log: fix NUTCH-631 - thanks to Stefan Will Added: lucene/nutch/trunk/src/plugin/index-more/src/test/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745096&r1=745095&r2=745096&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 17 14:28:14 2009 @@ -339,6 +339,9 @@ (Curtis d'Entremont, ab) 127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan) + +128. 
NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException + (Stefan Will, siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=745096&r1=745095&r2=745096&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Feb 17 14:28:14 2009 @@ -93,6 +93,7 @@ + Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745096&r1=745095&r2=745096&view=diff == --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Feb 17 14:28:14 2009 @@ -175,12 +175,31 @@ return doc; } - // Add Content-Type and its primaryType and subType + /** + * + * Add Content-Type and its primaryType and subType add contentType, + * primaryType and subType to field "type" as un-stored, indexed and + * un-tokenized, so that search results can be confined by contentType or its + * primaryType or its subType. + * + * + * For example, if contentType is application/vnd.ms-powerpoint, search can be + * done with one of the following qualifiers + * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint + * all case insensitive. The query filter is implemented in + * {@link TypeQueryFilter}. 
+ * + * + * @param doc + * @param data + * @param url + * @return + */ private NutchDocument addType(NutchDocument doc, ParseData data, String url) { MimeType mimeType = null; String contentType = data.getMeta(Response.CONTENT_TYPE); if (contentType == null) { - // Note by Jerome Charron on 20050415: +// Note by Jerome Charron on 20050415: // Content Type not solved by a previous plugin // Or unable to solve it... Trying to find it // Should be better to use the doc content too @@ -202,32 +221,31 @@ } contentType = mimeType.getName(); -String primaryType = mimeType.getSuperType().getName(); -String subType = mimeType.getSubTypes().first().getName(); -// leave this for future improvement -//MimeTypeParameterList parameterList = mimeType.getParameters() - -// add contentType, primaryType and subType to field "type" -// as un-stored, indexed and un-tokenized, so that search results -// can be confined by contentType or its primaryType or its subType. -// For example, if contentType is application/vnd.ms-powerpoint, -// search can be done with one of the following qualifiers -// type:application/vnd.ms-powerpoint -// type:application -// type:vnd.ms-powerpoint -// all case insensitive. -// The query filter is implemented in TypeQueryFilter.java + doc.add("type", contentType); -doc.add("type", primaryType); -doc.add("type", subType); -// add its primaryType and subType to respective fields -doc.add("primaryType", primaryType); -doc.add("subType&
svn commit: r743573 - in /lucene/nutch/trunk: site/index.html site/index.pdf src/site/src/documentation/content/xdocs/index.xml
Author: siren Date: Wed Feb 11 23:48:50 2009 New Revision: 743573 URL: http://svn.apache.org/viewvc?rev=743573&view=rev Log: fix link and name Modified: lucene/nutch/trunk/site/index.html lucene/nutch/trunk/site/index.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/index.xml Modified: lucene/nutch/trunk/site/index.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?rev=743573&r1=743572&r2=743573&view=diff == --- lucene/nutch/trunk/site/index.html (original) +++ lucene/nutch/trunk/site/index.html Wed Feb 11 23:48:50 2009 @@ -261,7 +261,7 @@ Lucene will be extremely well represented at - http://us.apachecon.com/c/acus2008/";>ApacheCon US 2009 + http://www.eu.apachecon.com/c/aceu2009/";>ApacheCon EU 2009 in Amsterdam, Netherlands this March 23-27, 2009: Modified: lucene/nutch/trunk/site/index.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.pdf?rev=743573&r1=743572&r2=743573&view=diff == --- lucene/nutch/trunk/site/index.pdf (original) +++ lucene/nutch/trunk/site/index.pdf Wed Feb 11 23:48:50 2009 @@ -157,10 +157,10 @@ >> endobj 32 0 obj -<< /Length 3367 /Filter [ /ASCII85Decode /FlateDecode ] +<< /Length 3366 /Filter [ /ASCII85Decode /FlateDecode ] >> stream 
-Gau`V9lo)J'#!u4...@lb,,1N,Y4P.EbrdqF_Wj5.Yl7i3&?YNjV`)*E\.+FDhB->'-JunI+ZkY#bg.&!#po8&3"f7YMY7gnrNuupK&Vb\=s"T58n&8g(GH\1eNA3A+6*,2#K(q...@bsi`gs.r##>j\GPCQh;-Ug@&5D=>kM,0&(;d1Gh;[)cdFM)Z_3W#k3Ci*2Zm^BF3aS0$\_,A50QR'l1]P8WR#kgD/*J^s8UXAo:P3J/EQMo\'sg3;nh)2[oGFG]fXir8\/jKeFWOh['JQM8,"hOQj+Ige%i7?9[P-X"tU6U56K^p(:,\)\DUcsQJ:HMl]4OSr)A[I(.5ijH/f4D5q;_V1Pbm1*"LHH#5q&@D/4E]#;hR(&pLM?WKrMp>Ig5!iABV%7I]&ei$opbQuN\T@,)qc##Q"gEQ3+D+Wt23Mr6>"m0>Hbf+3ERNs7lp68*)Af\fWNWLqju854ms$8\!Z+gb]qMBr4NldBO`.\ajWA4%UH"K,e,Sj)9/G)o[1oSSKlo9K)_i-G9m3n?S"nL>J;s(![(]EAR:VL#S1GOn%#t?51lN8YTiRX,[;YZsBu+?uK\RU_ogQL/20-FOC3nOo-j6u$?BdnS0XWaCBo8pq,R4cZR2u=WmD0;d_linEu8(>k'5o9e+&])6U(X6,9ZTj_fmr.7[;^qJG(KEtSRW;8mu6)^70_=Ocg1QdohLV"v5...@`!q.7jl>gseijh&,!4?`Y5\$-S4k;Xbnj-1u!U\.A2Z7\(5S!$s3WG[!Y;l1th^>=:[bH7eq*#mq.;/+,L)Jk5RJ7+%a:pmhWVN#RL`JQmG48IDIf1g3Q(]3H5LtIAO8=6UrMR=m6HmcE#ZM<`:)jb.=?4fYU65q+(+...@n&@X<.*_ip%a$T7h(6np%W.^ecL(FOlXA/K/oT>5VR.T'n\+geWq1VG0Z>p^fpA9jjWGrV(#...@%0mlmvdzr!x#<@Lo?t%^k,?(Y;'1^-...@ldm-\e3lcth]=kj89=;H*n0;]XNn_Y2^8dZ'UB)ZB^24nK:J+7V1R0`oLhIC`5QT,2eX3$K%CJOm&Zr+IXB%k=8bj*XF4b'3Ys%!C0%mIi`W%o43_8f@@n:;9VF/>c...@=l.g#em!n/$^R.]Wh"s,sb/^O]]X/&WHpF:,cfaWs9;a&":f...@pcro,to-]Uhm/6-=QnT\c)+b#/!)q>*Qk[B.#2s*...@0nbhfn-n\x*g"\n=Oj+n(u-nTBktH:1hh^?!6Vh]-0_?QecCbjL0-\jgSjU53>O0W2Y,y7g...@gopcnj6a]b[phx*$-q>230=&lc]56f-/[i0#F%.)9Mm'ep?[hj...@p(2iFGdfKdL)J!Th%4>QA/XYoKV3EAbErV25NE\Bg9ks$UQk&u\OTP%c/bYE\->S^dHnmqM_H3m&)6_-FZAM0hJ-V-SW Q%eiR-,&FY4\=m]]CI;nFOK7fCpAFY+^;O_PD"]6b9&bGrpi;:&-&U"a...@%q\f(5g)Cqf+VN>b7...@d1v:Y7UQ.,6ZBj0n 
+Gau`V9lo)J'#!u4...@lb,,i...@r,*EbrdqF_Wj5.Yl7i3&?YNjV`)*E\.+FDhB->'-JunI+ZkY#bg.&!#po8&3"f7YMY7gYV6FPm]WACiJIaMr/i+[TIjSl8E(o)3sP_16,F^r=-TcZ(Ve,5k/ZDp_;-bg...@x!\7jximsae=duo-^q.7`=i+[rc[o=)S#_$Gh$ffncT1/51__,Sq(QiMVKdiIDhLP/Nl=Zde76(g&^]Y#YYhU;'?...@p)#i2:=[2\,d%k)C\V#)VR,boH:aEB$Wf)tY6J[B8Of*md;=9&-tCVcl!aCuoOuL!u-1'u=f1Ju^k'_i0-`$KDo<0o-Janr2k"6P+r#(VsU#gCIsOXrd^.*Y!]jKO3.`Z,*3&;a7StOdZh1$[J2P+gb]qMBot6h5/YP:dX#>Vb_TOh4p5O^S9`LHmQKEG[A-4Ncbn#]ZLH(5:P]SQU&@.<,W\\DZ0p[\X4YR`L9$(neRa=`/A'W)QRhFhr%E?'#+Je+u(7(/53py6eay68fk8q\^n.p$rh...@s=mi#'g...@$ge7qpxp2,7@/\!?cOP>D^^1]I87aCa19b4iUKtt.BMm=`QVNs\09i70/4jH+erW+m_G(7nN8JYq(>o%b8rY87SNgO^0VT[DdS,JBG[Q;njG(U8!A)c[\+:J@&i:+1B#=<2H9sIlo_]$=J9),B%#[p&q$%tUU6LAHDeN+\9Ufe4cq^ssHL%k6.G1\[D)'ttYn!R&$m1LlY`Sf-C[li+q,LK5,a8W>[lY:Y#G(XnTW&^i?On`M?&;^ER,7#O!M-GlJ*u q<7i[H<7Dl_kD))O;+cmhi#rjl*=edto6?1br\=...@3,,jHFR!6a!iJ-f]#:OOP4IS&j7Q4J.Qg/'WP1$^q:aZpd&f\@,]H!Pbq)\KN;t*)2r...@#x8isuq`viao"?.H:h!ag#es8$yp9m-...@!+a]t?`e/eAH]=Rc"4oln[ys?ed3$h_fp\ciol^^pa?#hap...@ah==7;9MqC0^&*tZ>?.d...@bpdzf8^ffxgo5pi:7e`MUR2TEQI/,iz=jb$p12o-4...@e,gk84m0VL!2CK%>0:2]f;n^go%0$OU5,)ljr...@j5i%engqkk81ryn*$bbg%bV=eRrbpl59WeHb3gDo,\!d%#^[4jNaFnt64'B4%iI*3rH]MgSf,9c=_Fel]ZI&c#epE[6C8=!\i=h"9NC[tb1;hO#-b+r,*>m]WSYotk)JqB2d(j&WS[UcnRF/d...@xs2k3+rc%o!%_inrj=%q<_0.&3KedU"e[::HX39M"a+M-GX>Bh8u9IpR[+10$=nZ>lHh
svn commit: r743464 - in /lucene/nutch/trunk: site/index.html site/index.pdf src/site/src/documentation/content/xdocs/index.xml
Author: siren Date: Wed Feb 11 19:43:26 2009 New Revision: 743464 URL: http://svn.apache.org/viewvc?rev=743464&view=rev Log: add apachecon promo Modified: lucene/nutch/trunk/site/index.html lucene/nutch/trunk/site/index.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/index.xml Modified: lucene/nutch/trunk/site/index.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?rev=743464&r1=743463&r2=743464&view=diff == --- lucene/nutch/trunk/site/index.html (original) +++ lucene/nutch/trunk/site/index.html Wed Feb 11 19:43:26 2009 @@ -209,6 +209,10 @@ News +09 February 2009 - Lucene at ApacheCon Europe 2009 in + Amsterdam + + 2 April 2007: Nutch 0.9 Released @@ -247,7 +251,50 @@ News - + +09 February 2009 - Lucene at ApacheCon Europe 2009 in + Amsterdam + + +http://www.eu.apachecon.com/c/aceu2009/"; title="ApacheCon EU 2009"> + http://www.eu.apachecon.com/page_attachments//0115/125x125_basic.gif";> + + + Lucene will be extremely well represented at + http://us.apachecon.com/c/acus2008/";>ApacheCon US 2009 + in Amsterdam, Netherlands this March 23-27, 2009: + + + + + +http://eu.apachecon.com/c/aceu2009/sessions/197";>Lucene Boot Camp + - A two day training session, March 23 & 24th + + +http://eu.apachecon.com/c/aceu2009/sessions/201";>Solr Boot Camp - A one day training session, March 24th + + +http://eu.apachecon.com/c/aceu2009/sessions/136";>Introducing Apache Mahout - Grant Ingersoll. March 25th @ 10:30 + + +http://eu.apachecon.com/c/aceu2009/sessions/137";>Lucene/Solr Case Studies - Erik Hatcher. March 25th @ 11:30 + + +http://eu.apachecon.com/c/aceu2009/sessions/138";>Advanced Indexing Techniques with Apache Lucene - Michael Busch. March 25th @ 14:00 + + +http://eu.apachecon.com/c/aceu2009/sessions/251";>Apache Solr - A Case Study - Uri Boness. March 26th @ 17:30 + + +http://eu.apachecon.com/c/aceu2009/sessions/250";>Best of breed - httpd, forrest, solr and droids - Thorsten Scherler. 
March 27th @ 17:30 + + +http://eu.apachecon.com/c/aceu2009/sessions/165";>Apache Droids - an intelligent standalone robot framework - Thorsten Scherler. March 26th @ 15:00 + + + + 2 April 2007: Nutch 0.9 Released The 0.9 release of Nutch is now available. This is the second release of Nutch based entirely on the underlying Hadoop platform. This release includes several critical @@ -256,41 +303,41 @@ See http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt";> list of changes made in this version. The release is available http://lucene.apache.org/nutch/release/";>here. - + 24 September 2006: Nutch 0.8.1 Released The 0.8.1 release of Nutch is now available. This is a maintenance release to 0.8 branch fixing many serous bugs found in version 0.8. See http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt";> list of changes made in this version. The release is available http://lucene.apache.org/nutch/release/";>here. - + 25 July 2006: Nutch 0.8 Released The 0.8 release of Nutch is now available. This is the first release of Nutch based on hadoop architecure. See http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup";> CHANGES.txt for list of changes made in this version. The release is available http://lucene.apache.org/nutch/release/";>here. - + 31 March 2006: Nutch 0.7.2 Released The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158";> CHANGES.txt for details. The release is available http://lucene.apache.org/nutch/release/";>here. - + 1 October 2005: Nutch 0.7.1 Released The 0.7.1 release of Nutch is now available. This is a bug fix release. See http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986";> CHANGES.txt for details. The release is available http://lucene.apache.org/nutch/release/";>here. 
- + 17 August 2005: Nutch 0.7 Released This is the first Nutch release as an Apache Lucene sub-project. See http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150";> CHANGES.txt for details. The release is available
svn commit: r733014 - /lucene/nutch/trunk/site/doap.rdf
Author: siren Date: Fri Jan 9 03:57:01 2009 New Revision: 733014 URL: http://svn.apache.org/viewvc?rev=733014&view=rev Log: add missing releases Modified: lucene/nutch/trunk/site/doap.rdf Modified: lucene/nutch/trunk/site/doap.rdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/doap.rdf?rev=733014&r1=733013&r2=733014&view=diff == --- lucene/nutch/trunk/site/doap.rdf (original) +++ lucene/nutch/trunk/site/doap.rdf Fri Jan 9 03:57:01 2009 @@ -33,6 +33,30 @@ http://wiki.apache.org/nutch/"/> +branch-0.9 +nutch-0.9 +2007-04-01 +0.9 + + + + +branch-0.8 +nutch-0.8.1 +2006-09-24 +0.8.1 + + + + +branch-0.8 +nutch-0.8 +2006-06-25 +0.8 + + + + branch-0.7 nutch-0.7.2 2006-03-31
svn commit: r613378 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/searcher/ src/java/org
Author: siren Date: Sat Jan 19 00:59:29 2008 New Revision: 613378 URL: http://svn.apache.org/viewvc?rev=613378&view=rev Log: NUTCH-580 Remove deprecated hadoop api calls (FS) Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=613378&r1=613377&r2=613378&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Jan 19 00:59:29 2008 @@ -191,6 +191,8 @@ 66. NUTCH-584 - urls missing from fetchlist (Ruslan Ermilov, ab) +67. 
NUTCH-580 - Remove deprecated hadoop api calls (FS) (siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=613378&r1=613377&r2=613378&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sat Jan 19 00:59:29 2008 @@ -32,6 +32,7 @@ import org.apache.nutch.indexer.DeleteDuplicates; import org.apache.nutch.indexer.IndexMerger; import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -131,9 +132,9 @@ linkDbTool.invert(linkDb, segments, true, true, false); // invert links // index, dedup & merge - indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments)); + indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, HadoopFSUtil.getPassAllFilter())); dedup.dedup(new Path[] { indexes }); - merger.merge(fs.listPaths(indexes), index, tmpDir); + merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), index, tmpDir); } else { LOG.warn("No URLs to fetch - check your seed list and URL filters."); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=613378&r1=613377&r2=613378&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Sat Jan 19 00:59:29 2008 @@ -31,6 +31,7 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolBase; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -181,15 +182,7 @@ } else if 
(args[i].equals("-noAdditions")) { additionsAllowed = false; } else if (args[i].equals("-dir")) { -Path[] paths = fs.listPaths(new Path(args[++i]), new PathFilter() { - public boolean accept(Path dir) { -try { - return fs.isDirectory(dir); -} catch (IOException ioe) { - return false; -} - } -}); +Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); dirs.addAll(Arrays.asList(paths)); } else { dirs.add(new Path(args[i])); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=613378&r1=613377&r2=613378&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sat Ja
svn commit: r546998 - /lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
Author: siren Date: Wed Jun 13 11:50:14 2007 New Revision: 546998 URL: http://svn.apache.org/viewvc?view=rev&rev=546998 Log: remove debug iteration because it seems to block spelling suggestions Modified: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java Modified: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java?view=diff&rev=546998&r1=546997&r2=546998 == --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java Wed Jun 13 11:50:14 2007 @@ -150,14 +150,6 @@ , originalTerm, ng1, ng2, maxr, bStart, bEnd, bTransposition, maxd, lis, true); -Iterator it = lis.iterator(); - -while (it.hasNext()) { - if(LOG.isDebugEnabled()){ -LOG.debug(it.next().toString()); - } -} - if (suggestions.length > 0) { currentTerm.setSuggestedTerm(suggestions[0]);
svn commit: r538273 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
Author: siren Date: Tue May 15 11:29:49 2007 New Revision: 538273 URL: http://svn.apache.org/viewvc?view=rev&rev=538273 Log: NUTCH-161 Change Plain text parser to use parser.character.encoding.default property for fall back encoding spotted by KuroSaka TeruHiko Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007 @@ -19,6 +19,10 @@ 7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin (siren) + + 8. NUTCH-161 - Change Plain text parser to +use parser.character.encoding.default property for fall back encoding +(KuroSaka TeruHiko, siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273 == --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue May 15 11:29:49 2007 @@ -24,35 +24,42 @@ import org.apache.hadoop.conf.Configuration; public class TextParser implements Parser { + private Configuration conf; + + /** + * Encoding to be used when character set isn't specified + * as HTTP header. + */ + private String defaultEncoding; + /** + * Parses plain text document. This code uses configured default encoding + * [EMAIL PROTECTED] parser.character.encoding.default} if character set isn't specified + * as HTTP header. 
FIXME: implement charset detector + */ public ParseResult getParse(Content content) { -// ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new -// Outlink[0], metadata); - String encoding = StringUtil.parseCharacterEncoding(content .getContentType()); String text; -if (encoding != null) { // found an encoding header - try { // try to use named encoding -text = new String(content.getContent(), encoding); - } catch (java.io.UnsupportedEncodingException e) { -return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); - } -} else { - // FIXME: implement charset detector. This code causes problem when - // character set isn't specified in HTTP header. - text = new String(content.getContent()); // use default encoding +try { + text = new String(content.getContent(), encoding != null ? encoding + : defaultEncoding); +} catch (java.io.UnsupportedEncodingException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); } + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); - } public void setConf(Configuration conf) { +defaultEncoding = conf.get("parser.character.encoding.default", +"windows-1252"); this.conf = conf; }
svn commit: r537915 - /lucene/nutch/dist/KEYS
Author: siren Date: Mon May 14 09:58:17 2007 New Revision: 537915 URL: http://svn.apache.org/viewvc?view=rev&rev=537915 Log: update my key Modified: lucene/nutch/dist/KEYS Modified: lucene/nutch/dist/KEYS URL: http://svn.apache.org/viewvc/lucene/nutch/dist/KEYS?view=diff&rev=537915&r1=537914&r2=537915 == --- lucene/nutch/dist/KEYS (original) +++ lucene/nutch/dist/KEYS Mon May 14 09:58:17 2007 @@ -1,3 +1,17 @@ +This file contains the PGP keys of various developers. +Please don't use them for email unless you have to. Their main +purpose is code signing. + +Examples of importing this file in your keystore: + gpg --import KEYS.txt + (need pgp and other examples here) + +Examples of adding your key to this file: + pgp -kxa and append it to this file. + (pgpk -ll && pgpk -xa ) >> this file. + (gpg --list-sigs + && gpg --armor --export ) >> this file. + pub 1024D/A7239D59 2005-10-12 Key fingerprint = 4B96 409A 098D BD51 1DF2 BC18 DBAF 69BE A723 9D59 uid Doug Cutting (Lucene guy) <[EMAIL PROTECTED]> @@ -73,11 +87,27 @@ pub 1024D/0B7E6CFA 2006-07-06 uid Sami Siren <[EMAIL PROTECTED]> sig 30B7E6CFA 2006-07-06 Sami Siren <[EMAIL PROTECTED]> +sig E222DE4F 2007-05-02 Mathias Herberts <[EMAIL PROTECTED]> +sig 911203E4 2007-05-02 Mathias Herberts <[EMAIL PROTECTED]> +sig 302DA568 2007-05-03 Rodent of Unusual Size (DSA) <[EMAIL PROTECTED]> +sig 2C312D2F 2007-05-03 Rodent of Unusual Size (DSS) <[EMAIL PROTECTED]> +sig F12F6072 2007-05-05 Fred Vos <[EMAIL PROTECTED]> +sig 3990ED4AA 2007-05-02 Knut Anders Hatlen <[EMAIL PROTECTED]> +sig 3311A3DE5 2007-05-05 Ruediger Pluem <[EMAIL PROTECTED]> +sig A99F75DD 2007-05-03 Rodent of Unusual Size <[EMAIL PROTECTED]> +sig 5F298824 2007-05-06 Simon Pepping <[EMAIL PROTECTED]> +sig 4358C584 2007-05-06 Vincent Hennebert <[EMAIL PROTECTED]> +sig 4CEED75F 2007-05-07 Nick Burch <[EMAIL PROTECTED]> +sig C874155C 2007-05-07 Thilo Goetz (home key) <[EMAIL PROTECTED]> +sig 388817402 2007-05-06 Thomas Vandahl <[EMAIL PROTECTED]> +sig 01530235 
2007-05-02 Luc Maisonobe (general purpose) <[EMAIL PROTECTED]> +sig 40581837 2007-05-08 Nick Kew <[EMAIL PROTECTED]> +sig 5F6B8B72 2007-05-12 Stefan Bodewig <[EMAIL PROTECTED]> sub 2048g/A3A3EC3F 2006-07-06 sig 0B7E6CFA 2006-07-06 Sami Siren <[EMAIL PROTECTED]> -BEGIN PGP PUBLIC KEY BLOCK- -Version: GnuPG v1.4.4 (GNU/Linux) +Version: GnuPG v1.4.7 (GNU/Linux) mQGiBESs8FMRBADhMg5ONjSVuSVJoYbOL8vvoygjO9qH/MS21Ue2Hx2qLf8xB1/W baVL5kEH0ixkeg6H+qO4gGpyJ/cdww0v0CjbxRZw2R2QP1PtpZgioGv4YYNstUis @@ -90,21 +120,46 @@ GmZ8Q7LjYOnDyNIh+igVifkrlUlNKh3k8BVEXsH1OxffO28LzLQdU2FtaSBTaXJl biA8c2lyZW5AYXBhY2hlLm9yZz6IYAQTEQIAIAUCRKzwUwIbAwYLCQgHAwIEFQII AwQWAgMBAh4BAheAAAoJEAKlpgULfmz6vl0An0KCSRbIZjNFyQoDTR7Y/21tw94h -AJ93zAzfB8woj0MuqiOtUZ29OX/m+7kCDQRErPBsEAgAxjiL5UbpPeA/k2P1QjtL -Af/JTqG4lN6kBbRbvBbAOAYI0PYuskdsCxImdAopeJFnOm9fU0gGq4aggCeBlZhi -GiSN865Gm/RwwA5Jbtl0hbE5ZcczhaF7iSsKEwrui1ATciYy432ZH28HpWViZkBP -zJedwptd9uIrzSWa6OKB+xNLvPrYMmSPHvp6CRPRKyES71IpgXmw8Udy88q5PkMd -xM0LKPANv68DPx1IWRAiZGTt3/0zJr9lxW3R4waIvF1rB549VPcLl/Z1kXnVrz4B -7SuMMMtzDJsFD7F03K9jxYcIB+TySmQh7C77uFz3vH4XwviBtrvEi2rh7mGW7gwd -KwADBggAr04EsfuSET1+BTmVhC7yp+Dy/NE4kzd18I4L4VPd2vhD2y0BrVFK4Q45 -TJV2JQvMZH/rIj0jRVMC7cKTwm2P0igf/7rxw/yvO8DjCYVVwI1zatg5lSUiNxDo -h2O1g1co9GQATbdMg4YcT3ih6TgPyy10Vpq1D1yzWE7Sd8bllJY5iveK177QQ1IF -WtrKv4T2TCdTEtt8lkPHvvQ/Ooc55eGg75DOUe/7JSHdW0xht1sqerEoFQd1M7hY -5ss0MG+qnMOraqhTe54R8Le8zDyxh+AukIeo4PuyPSdMyoJAxcJq0YrItNI54sAc -50PIIfr07ho0pWWiqxWWmq47IPa5e4hJBBgRAgAJBQJErPBsAhsMAAoJEAKlpgUL -fmz6o9sAn2llnxCq/ZxaVT1252/g1IjCcJGIAKCgpJFo4pxVT8zCfPzWLnsBu7dL -Nw== -=nf3j +AJ93zAzfB8woj0MuqiOtUZ29OX/m+4hGBBARAgAGBQJGOOPMAAoJEBVFs/7iIt5P +au8AoJBhBjsv6RD1sYBsfhbaBsZyaENHAJoD/2IhBBMaBV3fNsTCbQilad2YAIhG +BBARAgAGBQJGOOh1AAoJEGPQra6REgPkOL8An3TWNp6bYNIRwWRKUYsCEHWQ4BMu +AJ9ix4bnBUf6R33seqwNz7Gp7z8eDohGBBARAgAGBQJGOghLAAoJEFCOrsUwLaVo +azcAoIaOwIwqXgW+4xZ7GYPfJEFrHGPbAKCG+gIdVX4NmNaxERZPj2qLycs9w4hG +BBARAgAGBQJGOghLAAoJEN26ZLosMS0vazcAoNgtdYT1uCNRLTdGaYhPvjGfVr04 
+AJ9MhDZ+LUm6/+k783wrph1mRU2iY4hGBBARAgAGBQJGPMSSAAoJEJhw7/PxL2By +uIoAnjog0y6x+vqOqJV+AWDbM99ZrOH+AKCC4u8eDndLGM9XwAp5Tl7jVr5oqIhG +BBMRAgAGBQJGORnaAAoJEOHh8rCZDtSqY8YAn0n0/gjvZKp7/bwoIj9T7jBkjpbZ +AJ40MSZ32QBcRnt2vP6vK7/SXpshE4hGBBMRAgAGBQJGPE+UAAoJEEwEKBgxGj3l +9OQAn145exS7RQZNTU8+BjBzSmRBGL9BAKCJ66ln9ObH2GwEHEhlS0fhEaJAD4ic +BBABAgAGBQJGOghLAAoJEJrNPMCpn3XdE8kD+wQBy+g+4TS8IVraka2wfibUpuqo +6UdRXiOO0CUWGBNq1jPE7LthT7tSf76Scfk7p2OiG0DfmkCBhi6hD1TgESOUOuG6 +QJM/VTwNg8KwvKXMgEd0drh/waktIIZoo/PS+LGYsyiLEKk43FL86v
svn commit: r537872 - in /lucene/nutch/dist: HEADER.html KEYS
Author: siren Date: Mon May 14 08:15:34 2007 New Revision: 537872 URL: http://svn.apache.org/viewvc?view=rev&rev=537872 Log: NUTCH-457 Create top level dist directory and checkin KEYS file to subversion Added: lucene/nutch/dist/HEADER.html lucene/nutch/dist/KEYS Added: lucene/nutch/dist/HEADER.html URL: http://svn.apache.org/viewvc/lucene/nutch/dist/HEADER.html?view=auto&rev=537872 == --- lucene/nutch/dist/HEADER.html (added) +++ lucene/nutch/dist/HEADER.html Mon May 14 08:15:34 2007 @@ -0,0 +1,9 @@ +http://lucene.apache.org/nutch/";>Nutch Releases + +Please make sure you're downloading from http://www.apache.org/dyn/closer.cgi/lucene/nutch/";>a nearby +mirror site, not from www.apache.org. + +For current development versions, see the Nutch http://people.apache.org/builds/lucene/nutch/nightly/";>nightly +build directory. Added: lucene/nutch/dist/KEYS URL: http://svn.apache.org/viewvc/lucene/nutch/dist/KEYS?view=auto&rev=537872 == --- lucene/nutch/dist/KEYS (added) +++ lucene/nutch/dist/KEYS Mon May 14 08:15:34 2007 @@ -0,0 +1,187 @@ +pub 1024D/A7239D59 2005-10-12 + Key fingerprint = 4B96 409A 098D BD51 1DF2 BC18 DBAF 69BE A723 9D59 +uid Doug Cutting (Lucene guy) <[EMAIL PROTECTED]> +sig 3A7239D59 2005-10-12 Doug Cutting (Lucene guy) <[EMAIL PROTECTED]> +sub 2048g/ADDE5978 2005-10-12 +sig A7239D59 2005-10-12 Doug Cutting (Lucene guy) <[EMAIL PROTECTED]> + +-BEGIN PGP PUBLIC KEY BLOCK- +Version: GnuPG v1.4.1 (FreeBSD) + +mQGiBENNR5oRBAC2ZzxD2fXYht8qkfT/6tjWJxLG4KH2dLEWSYEzku8ZtJ7eA6X7 +/hcvZdhjGH0aA6MAEVSxh6LO1hmRARE2e2Br68j4TjwbQ0J5BOgkMMAArmQe7w6B +RjKUI3H74Qbfjuk4Ebf1fNkRkpwuw+JxZu5pqpACqwv6nPhcSDDjbuA/1wCgj+++ +uxVSQMF4Xrd0hApOSYGHL8kD/jCU+vM3ILuFVTCgfC5RehmqwQo/f6KEv99jJSxX +ClcksiLquOH8vMc3MV1YWOe4u93DI7iAYzCylS1s2Wn0bLEBrbdGKLMH4hSSMDRC +pjnyvzvnEMhMU+Jn3LK6lQw4nHH+aDGFcYZ2pQen7JAcYz7l6QeTsvMnRV+v13K1 +/zRjA/9QUxrgg2N5WQnEhMegIWBKVhxQV6a2mSfeNd0ApxzdqdoHZNkUD+pKMB0F +oQ9aP55KbtvFosurFgEmvwLIoMnQohxjIhdk0Hx3xMT17CtYl04F0C+QNxeXpWr7 
+/B0kq8nALn17hXz5A1bFaiMHX86QmvNyMTDUC2VrVbkV251dlLQuRG91ZyBDdXR0 +aW5nIChMdWNlbmUgZ3V5KSA8Y3V0dGluZ0BhcGFjaGUub3JnPoheBBMRAgAeBQJD +TUeaAhsDBgsJCAcDAgMVAgMDFgIBAh4BAheAAAoJENuvab6nI51ZjRAAoIZ96gYE +f8QCDpXkBQqtNgRiF4t5AJ9JKMrN/Ow+Kyl75FU9U2KWyPoMk7kCDQRDTUejEAgA +m3UdcglfOdgqI7Z9XUX38yqiFzNozSvTdOt3j6evIVvjJ3e0P87tUQlrdsbMcaXd ++PAc7EA5LE0eJlE9jR1/18tsIlYi/n1hxz1lWtaZ+9he3yTB12QmAf4MMTXaRBkI +ZqwdwZxmL5V+2TmhFT2bIzPLgrMHNsA4dtQuBak41GC+VXovqitS9Xzse2Ki+U9u +SiRPsD7x5DcgJm9sg/zqCNrvDN8vOC8iHa/CIqsZr3xaPgfQLZp6Xk3doHLc6IJ9 +6knDAZvzJFgfj8MGCQoOExE/1XoNGTWcgoiy0D30ADG+rtIbaRT8tdQ6m19/ytqd +Zm7ibB7b78/pyfvvcB5tKwADBgf9GwdUdHUPjezlFpcCI/K3XHKdPLi00HJ2L1O8 +5pErBjDyZ5ey7vAMuYB5O31dB7pncSVsTdt9RRQHS+iLrv9aJjvYhV4yQU0ADkgC +9qEvxm7wpn76AT+Z1LIay/vNoQPxnfWq+uZD/Lnku1VcnMZ5teSG6uJzApBGYsgN +xpPPsobKKvclZdhO5NhhZLFZ0taWh4pna2jpDTLmyRa4kO7p7rIixsKxFfLUUc33 +2RqBomnm9eRlSvC4BBCq6M7YPLG0Rv5WmzuuWpc865EaMoBEtwPQBb4+qcMN69Lp +3x6EaymTWmHx1o8aUjAxhORE/miy53eGPzIXY+csjMyAmSxDG4hJBBgRAgAJBQJD +TUejAhsMAAoJENuvab6nI51ZlTIAn0oHlUPw+v1gVUJ8D2Nu26knOqJKAJ4spe/k +Sc2xRlsNP3tZiO+jYMAFSg== +=goQx +-END PGP PUBLIC KEY BLOCK- +pub 1024D/7C491924 2006-03-30 Piotr Kosiorowski <[EMAIL PROTECTED]> +sig 3 7C491924 2006-03-30 Piotr Kosiorowski <[EMAIL PROTECTED]> +sub 2048g/4A70BB35 2006-03-30 +sig 7C491924 2006-03-30 Piotr Kosiorowski <[EMAIL PROTECTED]> + +-BEGIN PGP PUBLIC KEY BLOCK- +Version: GnuPG v1.2.7 (GNU/Linux) + +mQGiBEQrfF8RBACblz5gaIolsKdJgtdy913C+k/QXvaeg3R+8dXXkgVgC5vvRbUk +Ei6UBRMU5H0cNE76d0XlMYP1MccqdowsfPfWxl04VViW6p+KHmBa2ICIWvq1PQXL +XhocuRZn6dzfnxcFjsJlsKXtX+okzL9rc1AHiPsb+14XFQtd0/uxs/qeswCgyM68 +hxpwMZU4U0Q7yYkB8usVjbcD/iC65v+8DPhVgxp4o66JJqTYkBZ73mS4f/DDlQsL +9qCj8h9rLYHmV85hSx3pBBDuz/HjIzu5ruj+l78H++WISXE82hj++OS0bpKnb+nV +x/iN+b/Y0W0CzMEms+42LcNz1azvLL6ZBgLwnUePT1mBnOy6UgFW1XZGow/XO4Lr +2py7BACP2WrV+rAzc8RcelmvE3eaAj0DJhAl2Brkdl7B4KDNpBTYZM2TaJ5G5pDK +EhzH3O6IZP4dRh4iEipl+qcJ0eC5OlKHxqyXXbQYH5jzqkl+4cAQRkCliWuFrGcO 
+o3XaOFE54dpY8FZbineEJLrg4Ynh592gO731IcP4gm401ORGv7QrUGlvdHIgS29z +aW9yb3dza2kgPHBrb3Npb3Jvd3NraUBhcGFjaGUub3JnPoheBBMRAgAeBQJEK3xf +AhsDBgsJCAcDAgMVAgMDFgIBAh4BAheAAAoJEEsO4ix8SRkkWLwAoLrn6dtn38yI +8dja2k2lJJ7PVpOoAJ9qZO+QfOfJRf1H+1L6qOuviiDkR7kCDQREK3xpEAgAklbu +2ctaceFu6nolNd3cnKNqDNppvSRSwDzZZytXjzV10E5VW7fYlN1+huOSV9nRLAIL +stNloFiOdQGElT0t8Xi9N9X1BuzSkxWMKqDHaTOSnKNupCuDzz9F3oYXVMbLwZBG +GJAMezd6WuCl+KyhsJgt0GD/H2Ucyck2CqTQRZFPOPOPB2urZbmw8F5bTI3u9J1Q +ElwApNTrHS04HyNEq5o9j/iTMvvunnkliQFI0Z/flvfHaV6go3/ZhMeVkLU7m/mq +bPh467HN0MTN5O+znak164nBumxcqD8yUF5TiWD42dykNffbN2ajZzgVvTxWerVV +mqVMTetbhl3Hoaff0wADBQf/d+XRxh7etS3IO5Jvv85de9QvQPFm5JZpnTNfdnil +b9G3WRjZIsdmAG2khtJNmlUMUegK0ej6jsCFmsWTqg8cbCG7TBcYySWKSTGklELu +N69g9VaG60GUX6EOoEmfRMr
svn commit: r537869 - /lucene/nutch/dist/
Author: siren Date: Mon May 14 08:05:09 2007 New Revision: 537869 URL: http://svn.apache.org/viewvc?view=rev&rev=537869 Log: (empty) Added: lucene/nutch/dist/
svn commit: r537860 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt src/plugin/ontology/lib/commons-logging-1.0.3.jar
Author: siren Date: Mon May 14 07:51:59 2007 New Revision: 537860 URL: http://svn.apache.org/viewvc?view=rev&rev=537860 Log: NUTCH-483 Remove redundant commons-logging jar from ontology plugin Removed: lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=537860&r1=537859&r2=537860 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon May 14 07:51:59 2007 @@ -16,6 +16,9 @@ bots in robots.txt (Dogacan Guney via siren) 6. NUTCH-482 - Remove redundant plugin lib-log4j (siren) + + 7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin +(siren) Release 0.9 - 2007-04-02
svn commit: r537857 - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/lib-log4j/ src/plugin/parse-pdf/ src/plugin/parse-rss/
Author: siren Date: Mon May 14 07:37:27 2007 New Revision: 537857 URL: http://svn.apache.org/viewvc?view=rev&rev=537857 Log: NUTCH-482 Remove redundant plugin lib-log4j Removed: lucene/nutch/trunk/src/plugin/lib-log4j/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon May 14 07:37:27 2007 @@ -14,7 +14,9 @@ 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other bots in robots.txt (Dogacan Guney via siren) - + + 6. 
NUTCH-482 - Remove redundant plugin lib-log4j (siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon May 14 07:37:27 2007 @@ -33,7 +33,6 @@ - @@ -122,7 +121,6 @@ - Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Mon May 14 07:37:27 2007 @@ -21,21 +21,18 @@ - - - Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Mon May 14 07:37:27 2007 @@ -40,7 +40,6 @@ - http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/build.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/parse-pdf/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/build.xml Mon May 14 07:37:27 2007 @@ -19,21 +19,8 @@ - - - - - - - - - - - - - Modified: lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Mon May 14 07:37:27 2007 @@ -27,12 +27,10 @@ - - http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- 
lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Mon May 14 07:37:27 2007 @@ -22,21 +22,18 @@ - - - Modified: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?view=diff&rev=537857&r1=537856&r2=537857 == --- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Mon May 14 07:37:27 2007 @@ -33,7 +33,6 @@ -
svn commit: r537591 - in /lucene/nutch/trunk: site/ src/site/src/documentation/content/xdocs/
Author: siren Date: Sun May 13 07:51:20 2007 New Revision: 537591 URL: http://svn.apache.org/viewvc?view=rev&rev=537591 Log: NUTCH-484 fix link to javadoc contributed by Gal Nitzan Modified: lucene/nutch/trunk/site/about.html lucene/nutch/trunk/site/bot.html lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/i18n.html lucene/nutch/trunk/site/index.html lucene/nutch/trunk/site/issue_tracking.html lucene/nutch/trunk/site/linkmap.html lucene/nutch/trunk/site/mailing_lists.html lucene/nutch/trunk/site/nightly.html lucene/nutch/trunk/site/tutorial.html lucene/nutch/trunk/site/tutorial8.html lucene/nutch/trunk/site/version_control.html lucene/nutch/trunk/src/site/src/documentation/content/xdocs/site.xml Modified: lucene/nutch/trunk/site/about.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/about.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/about.html (original) +++ lucene/nutch/trunk/site/about.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/bot.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/bot.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/bot.html (original) +++ lucene/nutch/trunk/site/bot.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/credits.html (original) +++ 
lucene/nutch/trunk/site/credits.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/i18n.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/i18n.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/i18n.html (original) +++ lucene/nutch/trunk/site/i18n.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/index.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/index.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/index.html (original) +++ lucene/nutch/trunk/site/index.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/issue_tracking.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/issue_tracking.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/issue_tracking.html (original) +++ lucene/nutch/trunk/site/issue_tracking.html Sun May 13 07:51:20 2007 @@ -99,7 +99,7 @@ API Docs (0.8.x) -http://lucene.apache.org/nutch/nutch-nightly/docs/api/index.html";>API Docs (nightly) +http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html";>API Docs (nightly) Resources Modified: lucene/nutch/trunk/site/linkmap.html URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/linkmap.html?view=diff&rev=537591&r1=537590&r2=537591 == --- lucene/nutch/trunk/site/linkmap.html (original) +++ lucene/nutch/trunk/site/linkmap.html Sun M
svn commit: r536925 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java src/plugin/lib-http/src/test/org/apache/nutch/protocol/h
Author: siren Date: Thu May 10 09:29:51 2007 New Revision: 536925 URL: http://svn.apache.org/viewvc?view=rev&rev=536925 Log: NUTCH-446 RobotRulesParser should ignore Crawl-delay values of other bots in robots.txt, contributed by Doğacan Güney Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536925&r1=536924&r2=536925 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:29:51 2007 @@ -11,6 +11,9 @@ (Eelco Lempsink via ab) 4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren) + + 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other +bots in robots.txt (Dogacan Guney via siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925 == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Thu May 10 09:29:51 2007 @@ -389,15 +389,17 @@ } else if ( (line.length() >= 12) && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) { doneAgents = true; -long crawlDelay = -1; -String delay = line.substring("Crawl-Delay:".length(), line.length()).trim(); -if (delay.length() > 0) { - try { -crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec - } catch (Exception e) { -LOG.info("can not parse Crawl-Delay:" + e.toString()); +if 
(addRules) { + long crawlDelay = -1; + String delay = line.substring("Crawl-Delay:".length(), line.length()).trim(); + if (delay.length() > 0) { +try { + crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec +} catch (Exception e) { + LOG.info("can not parse Crawl-Delay:" + e.toString()); +} +currentRules.setCrawlDelay(crawlDelay); } - currentRules.setCrawlDelay(crawlDelay); } } } @@ -500,7 +502,7 @@ /** command-line main for testing */ public static void main(String[] argv) { -if (argv.length != 3) { +if (argv.length < 3) { System.out.println("Usage:"); System.out.println(" java +"); System.out.println(""); @@ -513,7 +515,7 @@ try { FileInputStream robotsIn= new FileInputStream(argv[0]); LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1])); - String[] robotNames= new String[argv.length - 1]; + String[] robotNames= new String[argv.length - 2]; for (int i= 0; i < argv.length - 2; i++) robotNames[i]= argv[i+2]; Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925 == --- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu May 10 09:29:51 2007 @@ -262,6 +262,26 @@ } } } + + public void testCrawlDelay() { +RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" }); +String delayRule1 = "User-agent: nutchbot" + CR + +"Crawl-delay: 10" + CR + +"User-agent: foobot" + CR + +"Crawl-delay: 20" + CR + +"User-agent: *" + CR + +"Disallow:/baz" + CR; +String delayRule2 = "User-agent: foobot" + CR + +"Crawl-delay: 20" + CR + +"User-agent: *" + CR + +
svn commit: r536909 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
Author: siren Date: Thu May 10 09:13:15 2007 New Revision: 536909 URL: http://svn.apache.org/viewvc?view=rev&rev=536909 Log: NUTCH-456 Parse msexcel plugin speedup contributed by Heiko Dietze Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536909&r1=536908&r2=536909 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:13:15 2007 @@ -10,7 +10,8 @@ 3. NUTCH-393 - Indexer should handle null documents returned by filters. (Eelco Lempsink via ab) - + 4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?view=diff&rev=536909&r1=536908&r2=536909 == --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Thu May 10 09:13:15 2007 @@ -40,10 +40,10 @@ protected String extractText(InputStream input) throws Exception { -String resultText = ""; +StringBuilder resultText = new StringBuilder(); HSSFWorkbook wb = new HSSFWorkbook(input); if (wb == null) { - return resultText; + return resultText.toString(); } HSSFSheet sheet; @@ -69,25 +69,24 @@ for (int k=0; k
svn commit: r517015 - in /lucene/nutch/trunk: ./ lib/ src/java/org/apache/nutch/parse/ src/plugin/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/index-more/src/test/ src/plu
Author: siren Date: Sun Mar 11 14:18:23 2007 New Revision: 517015 URL: http://svn.apache.org/viewvc?view=rev&rev=517015 Log: merging 517012:516728 excluding changes made by dennis Added: lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar - copied unchanged from r516728, lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar - copied unchanged from r516728, lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Removed: lucene/nutch/trunk/src/plugin/index-more/src/test/ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html lucene/nutch/trunk/src/plugin/parse-js/src/test/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/build.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=517015&r1=517014&r2=517015 == --- 
lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 14:18:23 2007 @@ -158,18 +158,11 @@ 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins framework to operate properly (Heiko Dietze via mattmann) -54. Change OutlinkExtractor to use Regular Expressions from JRE (siren) - -55. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan +54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan Groschupf via kubes) -56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL - path is empty (kubes) - -57. Replace oro with jre regular expressions in plugins, remove oro from - dependencies (siren) - -58. Remove redundant commons logging jars (siren) +55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL +path is empty (kubes) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=517015&r1=517014&r2=517015 == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Sun Mar 11 14:18:23 2007 @@ -148,20 +148,8 @@ - - - - - - - - - - - - - - + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=517015&r1=517014&r2=517015 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sun Mar 11 14:18:23 2007 @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,21 +14,28 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.nutch.parse; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache
svn commit: r516908 - /lucene/nutch/trunk/bin/nutch
Author: siren Date: Sun Mar 11 07:30:35 2007 New Revision: 516908 URL: http://svn.apache.org/viewvc?view=rev&rev=516908 Log: revert to previous version as requested by ab Modified: lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=516908&r1=516907&r2=516908 == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Sun Mar 11 07:30:35 2007 @@ -148,7 +148,7 @@ fi fi -if $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X"; then +if [ $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then JAVA_LIBRARY_PATH=`cygpath -p -w "$JAVA_LIBRARY_PATH"` fi
svn commit: r516888 - /lucene/nutch/trunk/bin/nutch
Author: siren Date: Sun Mar 11 04:12:23 2007 New Revision: 516888 URL: http://svn.apache.org/viewvc?view=rev&rev=516888 Log: fix bin/nutch: line 152: cygpath: command not found on linux (FC5), hope i am not breaking it for some other env Modified: lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=516888&r1=516887&r2=516888 == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Sun Mar 11 04:12:23 2007 @@ -148,7 +148,7 @@ fi fi -if [ $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then +if $cygwin -a "X${JAVA_LIBRARY_PATH}" != "X"; then JAVA_LIBRARY_PATH=`cygpath -p -w "$JAVA_LIBRARY_PATH"` fi
svn commit: r516885 - /lucene/nutch/trunk/build.xml
Author: siren Date: Sun Mar 11 04:02:27 2007 New Revision: 516885 URL: http://svn.apache.org/viewvc?view=rev&rev=516885 Log: reduce the size of .job from 19+M down to 14+M Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=516885&r1=516884&r2=516885 == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Sun Mar 11 04:02:27 2007 @@ -148,8 +148,20 @@ - + + + + + + + + + + + + + +
svn commit: r516870 - in /lucene/nutch/trunk: CHANGES.txt lib/commons-logging-api-1.0.4.jar src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt src/plugin/ontology/lib/commons-logging-1.0.3.jar
Author: siren Date: Sun Mar 11 00:25:25 2007 New Revision: 516870 URL: http://svn.apache.org/viewvc?view=rev&rev=516870 Log: remove redundant commons-logging jars Removed: lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516870&r1=516869&r2=516870 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 00:25:25 2007 @@ -168,6 +168,8 @@ 57. Replace oro with jre regular expressions in plugins, remove oro from dependencies (siren) + +58. Remove redundant commons logging jars (siren) Release 0.8 - 2006-07-25
svn commit: r516866 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar
Author: siren Date: Sun Mar 11 00:01:22 2007 New Revision: 516866 URL: http://svn.apache.org/viewvc?view=rev&rev=516866 Log: Remove oro as dependency Removed: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516866&r1=516865&r2=516866 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 00:01:22 2007 @@ -166,6 +166,9 @@ 56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL path is empty (kubes) +57. Replace oro with jre regular expressions in plugins, remove oro from + dependencies (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop
svn commit: r516865 - /lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Author: siren Date: Sat Mar 10 23:36:56 2007 New Revision: 516865 URL: http://svn.apache.org/viewvc?view=rev&rev=516865 Log: change urlnormalizer-regex to use regular expressions from jre Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=516865&r1=516864&r2=516865 == --- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sat Mar 10 23:36:56 2007 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.net.urlnormalizer.regex; import java.net.URL; @@ -28,6 +27,7 @@ import java.util.List; import java.util.ArrayList; import java.util.Iterator; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,7 +40,6 @@ import javax.xml.parsers.*; import org.w3c.dom.*; -import org.apache.oro.text.regex.*; /** * Allows users to do regex substitutions on all/any URLs that are encountered, @@ -65,16 +64,14 @@ * string. 
*/ private static class Rule { -public Perl5Pattern pattern; +public Pattern pattern; public String substitution; } - private HashMap scopedRules; + private HashMap> scopedRules; - private static final List EMPTY_RULES = Collections.EMPTY_LIST; - - private PatternMatcher matcher = new Perl5Matcher(); + private static final List EMPTY_RULES = Collections.EMPTY_LIST; /** * The default constructor which is called from UrlNormalizerFactory @@ -93,9 +90,9 @@ * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) - throws IOException, MalformedPatternException { + throws IOException { super(conf); -List rules = readConfigurationFile(filename); +List rules = readConfigurationFile(filename); if (rules != null) scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); } @@ -106,9 +103,9 @@ // the default constructor was called if (this.scopedRules == null) { String filename = getConf().get("urlnormalizer.regex.file"); - scopedRules = new HashMap(); + scopedRules = new HashMap>(); URL url = getConf().getResource(filename); - List rules = null; + List rules = null; if (url == null) { LOG.warn("Can't load the default config file! " + filename); rules = EMPTY_RULES; @@ -126,7 +123,7 @@ // used in JUnit test. void setConfiguration(InputStream is, String scope) { -List rules = readConfiguration(is); +List rules = readConfiguration(is); scopedRules.put(scope, rules); LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); } @@ -136,7 +133,7 @@ * patterns. It accepts a string url as input and returns the altered string. */ public synchronized String regexNormalize(String urlString, String scope) { -List curRules = (List)scopedRules.get(scope); +List curRules = scopedRules.get(scope); if (curRules == null) { // try to populate String configFile = getConf().get("urlnormalizer.regex.file." 
+ scope); @@ -147,7 +144,6 @@ LOG.warn("Can't load resource for config file: " + configFile); } else { try { -InputStream is = resource.openStream(); curRules = readConfiguration(resource.openStream()); scopedRules.put(scope, curRules); } catch (Exception e) { @@ -162,14 +158,11 @@ } if (curRules == EMPTY_RULES || curRules == null) { // use global rules - curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT); + curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT); } -Iterator i = curRules.iterator(); -while (i.hasNext()) { - Rule r = (Rule) i.next(); - urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution( - r.substitution), urlString, Util.SUBSTITUTE_ALL); // a
svn commit: r516862 - /lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Author: siren Date: Sat Mar 10 22:50:10 2007 New Revision: 516862 URL: http://svn.apache.org/viewvc?view=rev&rev=516862 Log: change urlnormalizer-basic to use regular expressions from jre Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=516862&r1=516861&r2=516862 == --- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Sat Mar 10 22:50:10 2007 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,60 +14,62 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.net.urlnormalizer.basic; import java.net.URL; import java.net.MalformedURLException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -// Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -// Nutch imports import org.apache.nutch.net.URLNormalizer; -import org.apache.nutch.util.LogUtil; import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.*; -/** Converts URLs to a normal form . */ +/** + * Converts URLs to a normal form. + * + * All substitutions will be done step by step, to ensure that certain + * constellations will be normalized, too. 
+ * + * + * For example: "/aa/bb/../../cc/../foo.html will be normalized in the following + * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html" + * "/cc/../foo.html" "/foo.html". + * + * + * The normalization also takes care of leading "/../", which will be replaced + * by "/", because this is a rather a sign of bad webserver configuration than + * of a wanted link. For example, urls like "http://www.foo.com/../"; should + * return a http 404 error instead of redirecting to "http://www.foo.com";. + * + */ public class BasicURLNormalizer implements URLNormalizer { public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class); -private Perl5Compiler compiler = new Perl5Compiler(); -private ThreadLocal matchers = new ThreadLocal() { -protected synchronized Object initialValue() { - return new Perl5Matcher(); -} - }; -private Rule relativePathRule = null; -private Rule leadingRelativePathRule = null; +/** + * This pattern tries to find spots like "/xx/../" in the url, which could + * be replaced by "/" xx consists of chars, different then "/" (slash) and + * needs to have at least one char different from ".". + */ +private static final Pattern RELATIVE_PATH_PATTERN = Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)"); + +private static final String RELATIVE_PATH_SUBSTITUTION="/"; + +/** + * This pattern tries to find spots like leading "/../" in the url, which + * could be replaced by "/". + */ +private static final Pattern LEADING_RELATIVE_PATH_PATTERN = Pattern.compile("^(/\\.\\./)+"); + +private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/"; private Configuration conf; + public BasicURLNormalizer() { - try { -// this pattern tries to find spots like "/xx/../" in the url, which -// could be replaced by "/" xx consists of chars, different then "/" -// (slash) and needs to have at least one char different from "." 
-relativePathRule = new Rule(); -relativePathRule.pattern = (Perl5Pattern) - compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", - Perl5Compiler.READ_ONLY_MASK); -relativePathRule.substitution = new Perl5Substitution("/"); - -// this pattern tries to find spots like leading "/../" in the url, -// which could be replaced by "/" -leadingRelativePathRule = new Rule(); -leadingRelativePathRule.pattern = (Perl5Pattern) - compiler.compile("^(/\\.\\./)+", Perl5Co
svn commit: r516788 - in /lucene/nutch/trunk/src/plugin: ./ parse-js/src/java/org/apache/nutch/ parse-js/src/java/org/apache/nutch/parse/js/ parse-js/src/test/ parse-js/src/test/org/ parse-js/src/test
Author: siren Date: Sat Mar 10 13:39:04 2007 New Revision: 516788 URL: http://svn.apache.org/viewvc?view=rev&rev=516788 Log: change parse-js to use regular expressions from jre, add junit test, moved package.html to proper place Added: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html - copied unchanged from r516662, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html lucene/nutch/trunk/src/plugin/parse-js/src/test/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java Removed: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Modified: lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516788&r1=516787&r2=516788 == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:39:04 2007 @@ -89,6 +89,7 @@ + Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=516788&r1=516787&r2=516788 == --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Mar 10 13:39:04 2007 @@ -25,6 +25,8 @@ import java.util.ArrayList; 
import java.util.Arrays; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,13 +42,6 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -54,11 +49,24 @@ import org.w3c.dom.NodeList; /** - * This class is a heuristic link extractor for JavaScript files and - * code snippets. The general idea of a two-pass regex matching comes from - * Heritrix. Parts of the code come from OutlinkExtractor.java - * by Stephan Strittmatter. - * + * + * This class is a heuristic link extractor for JavaScript files and code + * snippets. The general idea of a two-pass regex matching comes from Heritrix. + * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter. 
+ * + * + * + * This Filter extracts javascript from following locations: + * + * from inside <script> tags + * from html 4.0 events like Window: onload,onunload, Form: + * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard: + * onkeydown,onkeypress,onkeyup Mouse: + * onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup + * + * a href starting with literal "javascript" + * + * * @author Andrzej Bialecki <[EMAIL PROTECTED]> */ public class JSParseFilter implements HtmlParseFilter, Parser { @@ -97,6 +105,7 @@ Node lNode = n.getAttributes().getNamedItem("language"); if (lNode == null) lang = "javascript"; else lang = lNode.getNodeValue(); +//XXX lang is not checked?? StringBuffer script = new StringBuffer(); NodeList nn = n.getChildNodes(); if (nn.getLength() > 0) { @@ -104,9 +113,9 @@ if (i > 0) script.append('\n'); script.append(nn.item(i).getNodeValue()); } - // if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + script.toString()); - // } + if (LOG.isDebugEnabled()) { +LOG.info("script: language=" + lang + ", text: " + scrip
svn commit: r516784 - /lucene/nutch/trunk/src/plugin/build.xml
Author: siren Date: Sat Mar 10 13:02:16 2007 New Revision: 516784 URL: http://svn.apache.org/viewvc?view=rev&rev=516784 Log: enable junit tests on index-more Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516784&r1=516783&r2=516784 == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:02:16 2007 @@ -83,6 +83,7 @@ +
svn commit: r516778 - in /lucene/nutch/trunk/src/plugin/index-more/src: java/org/apache/nutch/indexer/more/ test/ test/org/ test/org/apache/ test/org/apache/nutch/ test/org/apache/nutch/indexer/ test/
Author: siren Date: Sat Mar 10 12:11:43 2007 New Revision: 516778 URL: http://svn.apache.org/viewvc?view=rev&rev=516778 Log: change MoreIndexingFilter to use regular expressions from jre Added: lucene/nutch/trunk/src/plugin/index-more/src/test/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=516778&r1=516777&r2=516778 == --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sat Mar 10 12:11:43 2007 @@ -16,14 +16,6 @@ */ package org.apache.nutch.indexer.more; - -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; -import org.apache.oro.text.regex.Perl5Pattern; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.MalformedPatternException; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -56,6 +48,8 @@ import java.util.Date; import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.lang.time.DateUtils; @@ 
-244,21 +238,15 @@ // Patterns used to extract filename from possible non-standard // HTTP header "Content-Disposition". Typically it looks like: // Content-Disposition: inline; filename="foo.ppt" - private PatternMatcher matcher = new Perl5Matcher(); private Configuration conf; - static Perl5Pattern patterns[] = {null, null}; + static Pattern patterns[] = new Pattern[2]; static { -Perl5Compiler compiler = new Perl5Compiler(); -try { // order here is important patterns[0] = -(Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]"); +Pattern.compile("\\bfilename=['\"](.+)['\"]"); patterns[1] = -(Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b"); -} catch (MalformedPatternException e) { - // just ignore -} +Pattern.compile("\\bfilename=(\\S+)\\b"); } private Document resetTitle(Document doc, ParseData data, String url) { @@ -266,16 +254,28 @@ if (contentDisposition == null) return doc; -MatchResult result; -for (int i=0; ihttp://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java?view=auto&rev=516778 == --- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java (added) +++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java Sat Mar 10 12:11:43 2007 @@ -0,0 +1,36 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.nutch.indexer.more; + +import junit.framework.TestCase; + +public class MoreIndexingFilterTest extends TestCase { + + public void testGetFileNamePlain() { +assertMatches("attachment; filename=genome.jpeg;", "genome.jpeg"); +assertMatches("attachment; filename=\"genome.jpeg
svn commit: r516758 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar
Author: siren Date: Sat Mar 10 09:41:17 2007 New Revision: 516758 URL: http://svn.apache.org/viewvc?view=rev&rev=516758 Log: doh! putting oro back since it is still used outside core Added: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar (with props) Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516758&r1=516757&r2=516758 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:41:17 2007 @@ -158,8 +158,7 @@ 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins framework to operate properly (Heiko Dietze via mattmann) -54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid -of ORO dependency (siren) +54. Change OutlinkExtractor to use Regular Expressions from JRE (siren) Release 0.8 - 2006-07-25 Added: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar?view=auto&rev=516758 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar -- svn:mime-type = application/octet-stream
svn commit: r516754 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar src/java/org/apache/nutch/parse/OutlinkExtractor.java
Author: siren Date: Sat Mar 10 09:30:04 2007 New Revision: 516754 URL: http://svn.apache.org/viewvc?view=rev&rev=516754 Log: Change OutlinkExtractor to use Regular Expressions from JRE, get rid of ORO dependency Removed: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516754&r1=516753&r2=516754 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:30:04 2007 @@ -158,6 +158,9 @@ 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins framework to operate properly (Heiko Dietze via mattmann) +54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid +of ORO dependency (siren) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=516754&r1=516753&r2=516754 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sat Mar 10 09:30:04 2007 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,28 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; /** - * Extractor to extract {@link org.apache.nutch.parse.Outlink}s - * / URLs from plain text using Regular Expressions. + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from + * plain text using Regular Expressions. * * @see http://wiki.java.net/bin/view/Javapedia/RegularExpressions";>Comparison @@ -44,12 +37,14 @@ * * * @author Stephan Strittmatter - http://www.sybit.de - * @version 1.0 + * * @since 0.7 */ public class OutlinkExtractor { private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class); + private static final Outlink[] NO_LINKS = new Outlink[0]; + /** * Regex pattern to get URLs within a plain text. 
* @@ -57,190 +52,63 @@ * href="http://www.truerwords.net/articles/ut/urlactivation.html";>http://www.truerwords.net/articles/ut/urlactivation.html * */ - private static final String URL_PATTERN = - "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + + static final Pattern urlPattern = Pattern.compile(URL_PATTERN); /** - * Extracts Outlink from given plain text. - * Applying this method to non-plain-text can result in extremely lengthy - * runtimes for parasitic cases (postscript is a known example). - * @param plainText the plain text from wich URLs should be extracted. + * Extracts outlinks from a plain text. + * + * @param plainText * - * @return Array of Outlinks within found in plainText + * @return Array of Outlink s within found in plainText */ - public static Outlink[] getOutlinks(final String plainText, Configuration conf) { -return OutlinkExtractor.getOutlinks(plainText, "", conf); + public static Outlink[] getOutlinks(final String plainText, Configuration conf){ +return getOutlinks(plainText, null, conf); } + /** - * Extracts Outlink from given plain text and adds anchor - * to the extracted Outlinks - * - * @param plainText the plain text from wich URLs should be ex
svn commit: r499878 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/Indexer.java src/java/org/apache/nutch/segment/SegmentMerger.java src/java/org/apache/nutch/segment/SegmentR
Author: siren Date: Thu Jan 25 10:11:59 2007 New Revision: 499878 URL: http://svn.apache.org/viewvc?view=rev&rev=499878 Log: NUTCH-433 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=499878&r1=499877&r2=499878 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jan 25 10:11:59 2007 @@ -139,6 +139,9 @@ 45. NUTCH-68 - Add a tool to generate arbitrary fetchlists. (ab) +46. NUTCH-433 - java.io.EOFException in newer nightlies in mergesegs +or indexing from hadoop.io.DataOutputBuffer (siren) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=499878&r1=499877&r2=499878 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Jan 25 10:11:59 2007 @@ -24,7 +24,6 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.*; -import org.apache.nutch.fetcher.Fetcher; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; @@ -51,41 +50,12 @@ import org.apache.nutch.metadata.Nutch; /** Create indexes for segments. */ -public class Indexer extends ToolBase implements Reducer { +public class Indexer extends ToolBase implements Reducer, Mapper { public static final String DONE_NAME = "index.done"; public static final Log LOG = LogFactory.getLog(Indexer.class); - /** Wraps inputs in an {@link ObjectWritable}, to permit merging different - * types in reduce. 
*/ - public static class InputFormat extends SequenceFileInputFormat { -public RecordReader getRecordReader(FileSystem fs, FileSplit split, -JobConf job, Reporter reporter) - throws IOException { - - reporter.setStatus(split.toString()); - - return new SequenceFileRecordReader(job, split) { - public synchronized boolean next(Writable key, Writable value) -throws IOException { -ObjectWritable wrapper = (ObjectWritable)value; -try { - wrapper.set(getValueClass().newInstance()); -} catch (Exception e) { - throw new IOException(e.toString()); -} -return super.next(key, (Writable)wrapper.get()); - } - - // override the default - we want ObjectWritable-s here - public Writable createValue() { -return new ObjectWritable(); - } -}; -} - } - /** Unwrap Lucene Documents created by reduce and add them to an index. */ public static class OutputFormat extends org.apache.hadoop.mapred.OutputFormatBase { @@ -290,12 +260,9 @@ job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); +job.setInputFormat(SequenceFileInputFormat.class); -job.setInputFormat(InputFormat.class); -//job.setInputKeyClass(Text.class); -//job.setInputValueClass(ObjectWritable.class); - -//job.setCombinerClass(Indexer.class); +job.setMapperClass(Indexer.class); job.setReducerClass(Indexer.class); job.setOutputPath(indexDir); @@ -332,6 +299,11 @@ LOG.fatal("Indexer: " + StringUtils.stringifyException(e)); return -1; } + } + + public void map(WritableComparable key, Writable value, + OutputCollector output, Reporter reporter) throws IOException { +output.collect(key, new ObjectWritable(value)); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=499878&r1=499877&r2=499878 == --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Thu Jan 25 10:11:59 2007 @@ -32,9 +32,7 @@ import org.apache.hadoop.util.Progressable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; -import org.apache.nutch.fetcher.Fetcher; import org.apache.nutch.metadata.MetaWrapper; -import org.apache.nutch.metadat
svn commit: r497867 - /lucene/nutch/trunk/conf/
Author: siren Date: Fri Jan 19 08:37:35 2007 New Revision: 497867 URL: http://svn.apache.org/viewvc?view=rev&rev=497867 Log: NUTCH-400 Modified: lucene/nutch/trunk/conf/automaton-urlfilter.txt.template lucene/nutch/trunk/conf/common-terms.utf8 lucene/nutch/trunk/conf/configuration.xsl lucene/nutch/trunk/conf/crawl-tool.xml lucene/nutch/trunk/conf/crawl-urlfilter.txt.template lucene/nutch/trunk/conf/mime-types.xml lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/conf/regex-normalize.xml.template lucene/nutch/trunk/conf/regex-urlfilter.txt.template lucene/nutch/trunk/conf/subcollections.xml.template Modified: lucene/nutch/trunk/conf/automaton-urlfilter.txt.template URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/automaton-urlfilter.txt.template?view=diff&rev=497867&r1=497866&r2=497867 == --- lucene/nutch/trunk/conf/automaton-urlfilter.txt.template (original) +++ lucene/nutch/trunk/conf/automaton-urlfilter.txt.template Fri Jan 19 08:37:35 2007 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The default url filter. # Better for whole-internet crawling. 
Modified: lucene/nutch/trunk/conf/common-terms.utf8 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/common-terms.utf8?view=diff&rev=497867&r1=497866&r2=497867 == --- lucene/nutch/trunk/conf/common-terms.utf8 (original) +++ lucene/nutch/trunk/conf/common-terms.utf8 Fri Jan 19 08:37:35 2007 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Common terms and phrases which will be indexed in n-grams # in order to optimize search. 
content:a Modified: lucene/nutch/trunk/conf/configuration.xsl URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/configuration.xsl?view=diff&rev=497867&r1=497866&r2=497867 == --- lucene/nutch/trunk/conf/configuration.xsl (original) +++ lucene/nutch/trunk/conf/configuration.xsl Fri Jan 19 08:37:35 2007 @@ -1,4 +1,20 @@ + http://www.w3.org/1999/XSL/Transform"; version="1.0"> Modified: lucene/nutch/trunk/conf/crawl-tool.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-tool.xml?view=diff&rev=497867&r1=497866&r2=497867 == --- lucene/nutch/trunk/conf/crawl-tool.xml (original) +++ lucene/nutch/trunk/conf/crawl-tool.xml Fri Jan 19 08:37:35 2007 @@ -1,4 +1,20 @@ + Modified: lucene/nutch/trunk/conf/crawl-urlfilter.txt.template URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-urlfilter.txt.template?view=diff&rev=497867&r1=497866&r2=497867 == --- lucene/nutch/trunk/conf/crawl-urlfilter.txt.template (original) +++ lucene/nutch/trunk/conf/crawl-urlfilter.txt.template Fri Jan 19 08:37:35 2007 @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (th
svn commit: r497859 [3/3] - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/analysis-de/ src/plugin/analysis-fr/ src/plugin/clustering-carrot2/ src/plugin/clustering-carrot2/src/java/org/apache/nutc
Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cached.jsp?view=diff&rev=497859&r1=497858&r2=497859 == --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Fri Jan 19 08:17:32 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> <%@ page session="false" contentType="text/html; charset=UTF-8" Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cluster.jsp?view=diff&rev=497859&r1=497858&r2=497859 == --- lucene/nutch/trunk/src/web/jsp/cluster.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cluster.jsp Fri Jan 19 08:17:32 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> <% // @author Dawid Weiss Modified: lucene/nutch/trunk/src/web/jsp/explain.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/explain.jsp?view=diff&rev=497859&r1=497858&r2=497859 == --- lucene/nutch/trunk/src/web/jsp/explain.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/explain.jsp Fri Jan 19 08:17:32 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--%> <%@ page session="false" contentType="text/html; charset=UTF-8" Modified: lucene/nutch/trunk/src/web/jsp/index.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/index.jsp?view=diff&rev=497859&r1=497858&r2=497859 == --- lucene/nutch/trunk/src/web/jsp/index.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/index.jsp Fri Jan 19 08:17:32 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> <%@ page session="f
svn commit: r497540 - in /lucene/nutch/trunk/contrib/web2/plugins: web-query-propose-ontology/src/web/web-query-propose-ontology/ web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/ web
Author: siren Date: Thu Jan 18 11:18:52 2007 New Revision: 497540 URL: http://svn.apache.org/viewvc?view=rev&rev=497540 Log: NUTCH-400 Modified: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp lucene/nutch/trunk/contrib/web2/plugins/web-subcollection/src/web/web-subcollection/select.jsp Modified: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp?view=diff&rev=497540&r1=497539&r2=497540 == --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/src/web/web-query-propose-ontology/propose.jsp Thu Jan 18 11:18:52 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--%> <%@ page session="false"%> <%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%> <%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%> Modified: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp?view=diff&rev=497540&r1=497539&r2=497540 == --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp Thu Jan 18 11:18:52 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--%> <%@ page session="false"%> <%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%> <%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%> Modified: lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp?view=diff&rev=497540&r1=497539&r2=497540 == --- lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-resources/src/web/footer.jsp Thu Jan 18 11:18:52 2007 @@ -1,3 +1,19 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not u
svn commit: r496358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java
Author: siren Date: Mon Jan 15 07:02:37 2007 New Revision: 496358 URL: http://svn.apache.org/viewvc?view=rev&rev=496358 Log: fix NUTCH-430 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=496358&r1=496357&r2=496358 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Jan 15 07:02:37 2007 @@ -133,6 +133,9 @@ 43. NUTCH-428 - NullPointerException thrown when agent name is not configured properly. Changed to throw RuntimeException instead. +(siren) + +44. NUTCH-430 - Integer overflow in HashComparator.compare (siren) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=496358&r1=496357&r2=496358 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Jan 15 07:02:37 2007 @@ -264,39 +264,33 @@ output.collect(entry.url, entry.datum); } } - + /** Sort fetch lists by hash of URL. */ public static class HashComparator extends WritableComparator { -public HashComparator() { super(Text.class); } +public HashComparator() { + super(Text.class); +} public int compare(WritableComparable a, WritableComparable b) { - Text url1 = (Text)a; - Text url2 = (Text)b; + Text url1 = (Text) a; + Text url2 = (Text) b; int hash1 = hash(url1.getBytes(), 0, url1.getLength()); int hash2 = hash(url2.getBytes(), 0, url2.getLength()); - if (hash1 != hash2) { -return hash1 - hash2; - } - return compareBytes(url1.getBytes(), 0, url1.getLength(), - url2.getBytes(), 0, url2.getLength()); + return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 
0 : 1)); } - public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int hash1 = hash(b1, s1, l1); int hash2 = hash(b2, s2, l2); - if (hash1 != hash2) { -return hash1 - hash2; - } - return compareBytes(b1, s1, l1, b2, s2, l2); + return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1)); } private static int hash(byte[] bytes, int start, int length) { int hash = 1; // make later bytes more significant in hash code, so that sorting by // hashcode correlates less with by-host ordering. - for (int i = length-1; i >= 0; i--) -hash = (31 * hash) + (int)bytes[start+i]; + for (int i = length - 1; i >= 0; i--) +hash = (31 * hash) + (int) bytes[start + i]; return hash; } }
svn commit: r495762 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Author: siren Date: Fri Jan 12 14:12:15 2007 New Revision: 495762 URL: http://svn.apache.org/viewvc?view=rev&rev=495762 Log: NUTCH-428 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495762&r1=495761&r2=495762 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Jan 12 14:12:15 2007 @@ -131,6 +131,9 @@ 42. NUTCH-420 - Fix a bug in DeleteDuplicates where results depended on the order in which IndexDoc-s are processed. (Dogacan Guney via ab) +43. NUTCH-428 - NullPointerException thrown when agent name is not +configured properly. Changed to throw RuntimeException instead. + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=495762&r1=495761&r2=495762 == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Jan 12 14:12:15 2007 @@ -223,6 +223,9 @@ // Grab the agent names we advertise to robots files. // String agentName = conf.get("http.agent.name"); +if (null == agentName) { + throw new RuntimeException("Agent name not configured!"); +} String agentNames = conf.get("http.robots.agents"); StringTokenizer tok = new StringTokenizer(agentNames, ","); ArrayList agents = new ArrayList();
svn commit: r495716 - in /lucene/nutch/trunk: site/nightly.html site/nightly.pdf src/site/src/documentation/content/xdocs/nightly.xml
Author: siren Date: Fri Jan 12 12:18:27 2007 New Revision: 495716 URL: http://svn.apache.org/viewvc?view=rev&rev=495716 Log: fix url to nightly builds Modified: lucene/nutch/trunk/site/nightly.html lucene/nutch/trunk/site/nightly.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml Modified: lucene/nutch/trunk/site/nightly.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/nightly.html?view=diff&rev=495716&r1=495715&r2=495716 == --- lucene/nutch/trunk/site/nightly.html (original) +++ lucene/nutch/trunk/site/nightly.html Fri Jan 12 12:18:27 2007 @@ -162,7 +162,7 @@ -http://people.apache.org/dist/lucene/nutch/nightly/";>Nutch nightly builds (0.9-dev) +http://people.apache.org/builds/lucene/nutch/nightly/";>Nutch nightly builds (0.9-dev) Modified: lucene/nutch/trunk/site/nightly.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/nightly.pdf?view=diff&rev=495716&r1=495715&r2=495716 == --- lucene/nutch/trunk/site/nightly.pdf (original) +++ lucene/nutch/trunk/site/nightly.pdf Fri Jan 12 12:18:27 2007 @@ -55,7 +55,7 @@ /Rect [ 90.0 509.0 188.676 497.0 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] -/A << /URI (http://people.apache.org/dist/lucene/nutch/nightly/) +/A << /URI (http://people.apache.org/builds/lucene/nutch/nightly/) /S /URI >> /H /I >> @@ -106,9 +106,9 @@ xref 0 15 00 65535 f -002182 0 n -002240 0 n -002290 0 n +002184 0 n +002242 0 n +002292 0 n 15 0 n 71 0 n 000959 0 n @@ -116,10 +116,10 @@ 001117 0 n 001370 0 n 001537 0 n -001735 0 n -001848 0 n -001958 0 n -002066 0 n +001737 0 n +001850 0 n +001960 0 n +002068 0 n trailer << /Size 15 @@ -127,5 +127,5 @@ /Info 4 0 R >> startxref -2402 +2404 %%EOF Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml?view=diff&rev=495716&r1=495715&r2=495716 == --- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml (original) +++ 
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/nightly.xml Fri Jan 12 12:18:27 2007 @@ -21,7 +21,7 @@ To report bugs see issue tracking -http://people.apache.org/dist/lucene/nutch/nightly/";>Nutch nightly builds (0.9-dev) +http://people.apache.org/builds/lucene/nutch/nightly/";>Nutch nightly builds (0.9-dev)
svn commit: r493556 - /lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
Author: siren Date: Sat Jan 6 12:04:03 2007 New Revision: 493556 URL: http://svn.apache.org/viewvc?view=rev&rev=493556 Log: fix formatting Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?view=diff&rev=493556&r1=493555&r2=493556 == --- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Sat Jan 6 12:04:03 2007 @@ -33,53 +33,49 @@ private URLFilter[] filters; public URLFilters(Configuration conf) { - String order = conf.get(URLFILTER_ORDER); - this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName()); - - if (this.filters == null) { -String[] orderedFilters = null; -if (order != null && !order.trim().equals("")) { -orderedFilters = order.split("\\s+"); -} +String order = conf.get(URLFILTER_ORDER); +this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName()); + +if (this.filters == null) { + String[] orderedFilters = null; + if (order != null && !order.trim().equals("")) { +orderedFilters = order.split("\\s+"); + } -try { -ExtensionPoint point = PluginRepository.get(conf) -.getExtensionPoint(URLFilter.X_POINT_ID); -if (point == null) -throw new RuntimeException(URLFilter.X_POINT_ID -+ " not found."); -Extension[] extensions = point.getExtensions(); -HashMap filterMap = new HashMap(); -for (int i = 0; i < extensions.length; i++) { -Extension extension = extensions[i]; -URLFilter filter = (URLFilter) extension -.getExtensionInstance(); -if (!filterMap.containsKey(filter.getClass().getName())) { -filterMap.put(filter.getClass().getName(), filter); -} -} -if (orderedFilters == null) { -conf.setObject(URLFilter.class.getName(), filterMap -.values().toArray(new URLFilter[0])); -} else { -ArrayList filters = new ArrayList(); -for (int i = 0; i < 
orderedFilters.length; i++) { - URLFilter filter = (URLFilter) filterMap -.get(orderedFilters[i]); - if(filter != null){ -filters.add(filter); - } -} -conf.setObject(URLFilter.class.getName(), -filters.toArray(new URLFilter[filters.size()])); -} -} catch (PluginRuntimeException e) { -throw new RuntimeException(e); + try { +ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( +URLFilter.X_POINT_ID); +if (point == null) + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); +Extension[] extensions = point.getExtensions(); +HashMap filterMap = new HashMap(); +for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + URLFilter filter = (URLFilter) extension.getExtensionInstance(); + if (!filterMap.containsKey(filter.getClass().getName())) { +filterMap.put(filter.getClass().getName(), filter); + } +} +if (orderedFilters == null) { + conf.setObject(URLFilter.class.getName(), filterMap.values().toArray( + new URLFilter[0])); +} else { + ArrayList filters = new ArrayList(); + for (int i = 0; i < orderedFilters.length; i++) { +URLFilter filter = (URLFilter) filterMap.get(orderedFilters[i]); +if (filter != null) { + filters.add(filter); } -this.filters = (URLFilter[]) conf.getObject(URLFilter.class -.getName()); + } + conf.setObject(URLFilter.class.getName(), filters + .toArray(new URLFilter[filters.size()])); } - } + } catch (PluginRuntimeException e) { +throw new RuntimeException(e); + } + this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName()); +} + } /** Run all defined filters. Assume logical AND. */ public String filter(String urlString) throws URLFilterException {
svn commit: r493555 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
Author: siren Date: Sat Jan 6 12:00:48 2007 New Revision: 493555 URL: http://svn.apache.org/viewvc?view=rev&rev=493555 Log: fix formatting Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493555&r1=493554&r2=493555 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Sat Jan 6 12:00:48 2007 @@ -43,56 +43,69 @@ private IndexingFilter[] indexingFilters; public IndexingFilters(Configuration conf) { - /* Get indexingfilter.order property */ - String order = conf.get(INDEXINGFILTER_ORDER); - this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName()); - if (this.indexingFilters == null) { - /* If ordered filters are required, prepare array of filters based on property */ - String[] orderedFilters = null; - if (order != null && !order.trim().equals("")) { - orderedFilters = order.split("\\s+"); +/* Get indexingfilter.order property */ +String order = conf.get(INDEXINGFILTER_ORDER); +this.indexingFilters = (IndexingFilter[]) conf +.getObject(IndexingFilter.class.getName()); +if (this.indexingFilters == null) { + /* + * If ordered filters are required, prepare array of filters based on + * property + */ + String[] orderedFilters = null; + if (order != null && !order.trim().equals("")) { +orderedFilters = order.split("\\s+"); + } + try { +ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( +IndexingFilter.X_POINT_ID); +if (point == null) + throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found."); +Extension[] extensions = point.getExtensions(); +HashMap filterMap = new HashMap(); +for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + 
IndexingFilter filter = (IndexingFilter) extension + .getExtensionInstance(); + if (LOG.isInfoEnabled()) { +LOG.info("Adding " + filter.getClass().getName()); } -try { -ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID); -if (point == null) -throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found."); -Extension[] extensions = point.getExtensions(); -HashMap filterMap = new HashMap(); -for (int i = 0; i < extensions.length; i++) { -Extension extension = extensions[i]; -IndexingFilter filter = (IndexingFilter) extension.getExtensionInstance(); -if (LOG.isInfoEnabled()) { - LOG.info("Adding " + filter.getClass().getName()); -} -if (!filterMap.containsKey(filter.getClass().getName())) { -filterMap.put(filter.getClass().getName(), filter); -} -} -/* If no ordered filters required, just get the filters in an indeterminate order */ -if (orderedFilters == null) { -conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0])); -/* Otherwise run the filters in the required order */ -} else { -ArrayList filters = new ArrayList(); -for (int i = 0; i < orderedFilters.length; i++) { -IndexingFilter filter = (IndexingFilter) filterMap -.get(orderedFilters[i]); -if (filter != null) { - filters.add(filter); -} -} -conf.setObject(IndexingFilter.class.getName(), filters.toArray(new IndexingFilter[filters.size()])); -} -} catch (PluginRuntimeException e) { -throw new RuntimeException(e); + if (!filterMap.containsKey(filter.getClass().getName())) { +filterMap.put(filter.getClass().getName(), filter); + } +} +/* + * If no ordered filters required, just get the filters in an + * indeterminate order + */ +if (orderedFilters == null) { + conf.setObject(IndexingFilt
svn commit: r493548 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexingFilters.java src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Author: siren Date: Sat Jan 6 11:49:49 2007 New Revision: 493548 URL: http://svn.apache.org/viewvc?view=rev&rev=493548 Log: fix NUTCH-421 Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493548&r1=493547&r2=493548 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Jan 6 11:49:49 2007 @@ -119,6 +119,9 @@ 38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains Filters that are not in plugin.includes (Stefan Groschupf, siren) + +39. NUTCH-421 - Allow predeterminate running order of indexing filters + (Alan Tanaman, siren) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=493548&r1=493547&r2=493548 == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jan 6 11:49:49 2007 @@ -536,6 +536,24 @@ + + + + indexingfilter.order + + The order by which index filters are applied. + If empty, all available index filters (as dictated by properties + plugin-includes and plugin-excludes above) are loaded and applied in system + defined order. If not empty, only named filters are loaded and applied + in given order. For example, if this property has value: + org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter + then BasicIndexingFilter is applied first, and MoreIndexingFilter second. + + Filter ordering might have impact on result if one filter depends on output of + another filter. 
+ + + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493548&r1=493547&r2=493548 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Sat Jan 6 11:49:49 2007 @@ -17,6 +17,7 @@ package org.apache.nutch.indexer; +import java.util.ArrayList; import java.util.HashMap; // Commons Logging imports @@ -35,13 +36,22 @@ /** Creates and caches [EMAIL PROTECTED] IndexingFilter} implementing plugins.*/ public class IndexingFilters { + public static final String INDEXINGFILTER_ORDER = "indexingfilter.order"; + public final static Log LOG = LogFactory.getLog(IndexingFilters.class); private IndexingFilter[] indexingFilters; public IndexingFilters(Configuration conf) { + /* Get indexingfilter.order property */ + String order = conf.get(INDEXINGFILTER_ORDER); this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName()); if (this.indexingFilters == null) { + /* If ordered filters are required, prepare array of filters based on property */ + String[] orderedFilters = null; + if (order != null && !order.trim().equals("")) { + orderedFilters = order.split("\\s+"); + } try { ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID); if (point == null) @@ -58,7 +68,21 @@ filterMap.put(filter.getClass().getName(), filter); } } -conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0])); +/* If no ordered filters required, just get the filters in an indeterminate order */ +if (orderedFilters == null) { +conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0])); +/* Otherwise run the filters in the required order */ +} else { +ArrayList filters 
= new ArrayList(); +for (int i = 0; i < orderedFilters.length; i++) { +IndexingFilter filter = (IndexingFilter) filterMap +.get(orderedFilters[i]); +if (filter != null) { + filters.add(filter); +} +} +c
svn commit: r493438 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/net/URLFilters.java src/test/org/apache/nutch/net/TestURLFilters.java
Author: siren Date: Sat Jan 6 01:39:20 2007 New Revision: 493438 URL: http://svn.apache.org/viewvc?view=rev&rev=493438 Log: Fix NUTCH-325 Added: lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493438&r1=493437&r2=493438 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Jan 6 01:39:20 2007 @@ -117,6 +117,9 @@ 37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after skipping bad URLs. (Michael Stack via ab) +38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains +Filters that are not in plugin.includes (Stefan Groschupf, siren) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?view=diff&rev=493438&r1=493437&r2=493438 == --- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Sat Jan 6 01:39:20 2007 @@ -17,6 +17,7 @@ package org.apache.nutch.net; +import java.util.ArrayList; import java.util.HashMap; import org.apache.nutch.plugin.Extension; @@ -28,10 +29,11 @@ /** Creates and caches [EMAIL PROTECTED] URLFilter} implementing plugins.*/ public class URLFilters { + public static final String URLFILTER_ORDER = "urlfilter.order"; private URLFilter[] filters; public URLFilters(Configuration conf) { - String order = conf.get("urlfilter.order"); + String order = conf.get(URLFILTER_ORDER); this.filters = (URLFilter[]) conf.getObject(URLFilter.class.getName()); if (this.filters == null) { @@ -60,12 +62,16 @@ conf.setObject(URLFilter.class.getName(), filterMap .values().toArray(new URLFilter[0])); } else { -URLFilter[] filter = 
new URLFilter[orderedFilters.length]; +ArrayList filters = new ArrayList(); for (int i = 0; i < orderedFilters.length; i++) { -filter[i] = (URLFilter) filterMap + URLFilter filter = (URLFilter) filterMap .get(orderedFilters[i]); + if(filter != null){ +filters.add(filter); + } } -conf.setObject(URLFilter.class.getName(), filter); +conf.setObject(URLFilter.class.getName(), +filters.toArray(new URLFilter[filters.size()])); } } catch (PluginRuntimeException e) { throw new RuntimeException(e); Added: lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java?view=auto&rev=493438 == --- lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java Sat Jan 6 01:39:20 2007 @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestURLFilters extends TestCase { + + /** + * Testcase for NUTCH-325. 
+ * @throws URLFilterException + */ + public void testNonExistingUrlFilter() throws URLFilterException { +Configuration conf = NutchConfiguration.create(); +String class1 = "NonExistingFilter"; +
svn commit: r493159 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: siren Date: Fri Jan 5 11:43:48 2007 New Revision: 493159 URL: http://svn.apache.org/viewvc?view=rev&rev=493159 Log: reuse existing code in DecreasingFloatComparator Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=493159&r1=493158&r2=493159 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 5 11:43:48 2007 @@ -248,20 +248,15 @@ } - public static class DecreasingFloatComparator extends WritableComparator { - -public DecreasingFloatComparator() { - super(FloatWritable.class); -} + public static class DecreasingFloatComparator extends FloatWritable.Comparator { /** Compares two FloatWritables decreasing. */ -public int compare(WritableComparable o1, WritableComparable o2) { - float thisValue = ((FloatWritable) o1).get(); - float thatValue = ((FloatWritable) o2).get(); - return (thisValue
svn commit: r493114 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: siren Date: Fri Jan 5 10:17:01 2007 New Revision: 493114 URL: http://svn.apache.org/viewvc?view=rev&rev=493114 Log: minor mod to javadoc Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=493114&r1=493113&r2=493114 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 5 10:17:01 2007 @@ -353,7 +353,10 @@ .currentTimeMillis(), true, false); } - /** Generate fetchlists in a segment. */ + /** + * Generate fetchlists in a segment. + * @return Path to generated segment or null if no entries were selected. + * */ public Path generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean force)
svn commit: r485076 - in /lucene/nutch/trunk/src: java/org/apache/nutch/metadata/SpellCheckedMetadata.java test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Author: siren Date: Sat Dec 9 14:27:07 2006 New Revision: 485076 URL: http://svn.apache.org/viewvc?view=rev&rev=485076 Log: Optimize SpellCheckedMetadata further by taking into account the fact that it is used only for http-headers. I am starting to believe that spellchecking should just be an utility method used by http protocol plugins. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?view=diff&rev=485076&r1=485075&r2=485076 == --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Sat Dec 9 14:27:07 2006 @@ -25,10 +25,9 @@ /** * A decorator to Metadata that adds spellchecking capabilities to property - * names. - * - * All the static String fields declared by this class are used as reference - * names for syntax correction on meta-data naming. + * names. Currently used spelling vocabulary contains just the httpheaders from + * [EMAIL PROTECTED] HttpHeaders} class. + * */ public class SpellCheckedMetadata extends Metadata { @@ -49,18 +48,23 @@ */ private static String[] normalized = null; - // Uses self introspection to fill the metanames index and the - // metanames list. static { -for (Field field : SpellCheckedMetadata.class.getFields()) { - int mods = field.getModifiers(); - if (Modifier.isFinal(mods) && Modifier.isPublic(mods) - && Modifier.isStatic(mods) && field.getType().equals(String.class)) { -try { - String val = (String) field.get(null); - NAMES_IDX.put(normalize(val), val); -} catch (Exception e) { - // Simply ignore... + +// Uses following array to fill the metanames index and the +// metanames list. 
+Class[] spellthese = {HttpHeaders.class}; + +for (Class spellCheckedNames : spellthese) { + for (Field field : spellCheckedNames.getFields()) { +int mods = field.getModifiers(); +if (Modifier.isFinal(mods) && Modifier.isPublic(mods) +&& Modifier.isStatic(mods) && field.getType().equals(String.class)) { + try { +String val = (String) field.get(null); +NAMES_IDX.put(normalize(val), val); + } catch (Exception e) { +// Simply ignore... + } } } } @@ -125,8 +129,7 @@ @Override public void add(final String name, final String value) { -String normalized = getNormalizedName(name); -super.add(normalized, value); +super.add(getNormalizedName(name), value); } @Override Modified: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?view=diff&rev=485076&r1=485075&r2=485076 == --- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Sat Dec 9 14:27:07 2006 @@ -36,6 +36,8 @@ */ public class TestSpellCheckedMetadata extends TestCase { + private static final int NUM_ITERATIONS = 1; + public TestSpellCheckedMetadata(String testName) { super(testName); } @@ -63,7 +65,7 @@ assertEquals("Content-Type", SpellCheckedMetadata .getNormalizedName("contntype")); } - + /** Test for the add(String, String) method. 
*/ public void testAdd() { String[] values = null; @@ -237,18 +239,35 @@ assertEquals(0, result.size()); meta.add("name-one", "value-1.1"); result = writeRead(meta); +meta.add("Contenttype", "text/html"); assertEquals(1, result.size()); assertEquals(1, result.getValues("name-one").length); assertEquals("value-1.1", result.get("name-one")); meta.add("name-two", "value-2.1"); meta.add("name-two", "value-2.2"); result = writeRead(meta); -assertEquals(2, result.size()); +assertEquals(3, result.size()); assertEquals(1, result.getValues("name-one").length); assertEquals("value-1.1", result.getValues("name-one")[0]); assertEquals(2, result.getValues("name-two").length); assertEquals("value-2.1"
svn commit: r485072 - in /lucene/nutch/trunk/src/test/org/apache/nutch: crawl/TestLinkDbMerger.java indexer/TestDeleteDuplicates.java
Author: siren Date: Sat Dec 9 14:20:02 2006 New Revision: 485072 URL: http://svn.apache.org/viewvc?view=rev&rev=485072 Log: generate test resources under build/test Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?view=diff&rev=485072&r1=485071&r2=485072 == --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Sat Dec 9 14:20:02 2006 @@ -87,7 +87,7 @@ expected.put(url21, urls21_expected); conf = NutchConfiguration.create(); fs = FileSystem.get(conf); -testDir = new Path("test-crawldb-" + +testDir = new Path("build/test/test-linkdb-" + new java.util.Random().nextInt()); fs.mkdirs(testDir); } @@ -105,8 +105,6 @@ public void testMerge() throws Exception { Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); -Path testDir = new Path("test-linkdb-" + -new java.util.Random().nextInt()); fs.mkdirs(testDir); Path linkdb1 = new Path(testDir, "linkdb1"); Path linkdb2 = new Path(testDir, "linkdb2"); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=485072&r1=485071&r2=485072 == --- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Sat Dec 9 14:20:02 2006 @@ -31,7 +31,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.nutch.analysis.NutchDocumentAnalyzer; -import 
org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import junit.framework.TestCase; @@ -47,7 +46,7 @@ conf = NutchConfiguration.create(); conf.set("fs.default.name", "local"); fs = FileSystem.get(conf); -root = new Path("dedup2-test-" + new Random().nextInt()); +root = new Path("build/test/dedup2-test-" + new Random().nextInt()); // create test indexes index1 = createIndex("index1", true, 1.0f, 10L); index2 = createIndex("index2", false, 2.0f, 20L);
svn commit: r481738 - in /lucene/nutch/trunk/contrib/web2: README.txt build.xml
Author: siren Date: Sun Dec 3 00:49:48 2006 New Revision: 481738 URL: http://svn.apache.org/viewvc?view=rev&rev=481738 Log: simplify compilation process Modified: lucene/nutch/trunk/contrib/web2/README.txt lucene/nutch/trunk/contrib/web2/build.xml Modified: lucene/nutch/trunk/contrib/web2/README.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/README.txt?view=diff&rev=481738&r1=481737&r2=481738 == --- lucene/nutch/trunk/contrib/web2/README.txt (original) +++ lucene/nutch/trunk/contrib/web2/README.txt Sun Dec 3 00:49:48 2006 @@ -23,34 +23,13 @@ controllers and pojos, jar libraries), ui markup (in form of html, jsp), ui resources css, javascript. -Before compiling core nutch plugins you must edit the -core nutch plugin 'nutch-extensionpoints' plugin.xml -and add following snippet into it: - - - - - - - - - -To compile you need to fist build your nutch (core and plugins) -after that run ant war to generate war. - To compile web2 plugins you must issue command ant compile-plugins After compiling you must enable plugins, please refer to nutch -documentation +documentation + +To build deployable .war issue command ant war. The nutch plugins are not included in the generated war and you need to properly configure where your plugins are. This is achieved @@ -125,23 +104,21 @@ absolute path, must start with /WEB-INF - - - - + + + + Referencing jsp resources inside plugins absolute path, must start with /plugin/ - - - - - - + + + + Static resources Modified: lucene/nutch/trunk/contrib/web2/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/build.xml?view=diff&rev=481738&r1=481737&r2=481738 == --- lucene/nutch/trunk/contrib/web2/build.xml (original) +++ lucene/nutch/trunk/contrib/web2/build.xml Sun Dec 3 00:49:48 2006 @@ -79,7 +79,7 @@ - + + + + + + + + + + + @@ -127,7 +140,8 @@ - +
svn commit: r481445 - in /lucene/nutch/trunk: site/ site/images/ src/site/src/documentation/ src/site/src/documentation/content/xdocs/ src/site/src/documentation/resources/images/
Author: siren Date: Fri Dec 1 15:25:49 2006 New Revision: 481445 URL: http://svn.apache.org/viewvc?view=rev&rev=481445 Log: added lucene logo so site is independent, added link to solr, added link to my apache page in credits Added: lucene/nutch/trunk/site/images/lucene_green_150.gif (with props) lucene/nutch/trunk/src/site/src/documentation/resources/images/lucene_green_150.gif (with props) Modified: lucene/nutch/trunk/site/about.html lucene/nutch/trunk/site/bot.html lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/credits.pdf lucene/nutch/trunk/site/i18n.html lucene/nutch/trunk/site/index.html lucene/nutch/trunk/site/issue_tracking.html lucene/nutch/trunk/site/linkmap.html lucene/nutch/trunk/site/linkmap.pdf lucene/nutch/trunk/site/mailing_lists.html lucene/nutch/trunk/site/nightly.html lucene/nutch/trunk/site/tutorial.html lucene/nutch/trunk/site/tutorial8.html lucene/nutch/trunk/site/version_control.html lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml lucene/nutch/trunk/src/site/src/documentation/content/xdocs/site.xml lucene/nutch/trunk/src/site/src/documentation/skinconf.xml Modified: lucene/nutch/trunk/site/about.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/about.html?view=diff&rev=481445&r1=481444&r2=481445 == --- lucene/nutch/trunk/site/about.html (original) +++ lucene/nutch/trunk/site/about.html Fri Dec 1 15:25:49 2006 @@ -21,7 +21,7 @@ -http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; title="Apache Lucene"> +http://lucene.apache.org/";> http://lucene.apache.org/nutch/";> @@ -127,6 +127,9 @@ http://lucene.apache.org/hadoop/";>Hadoop + + +http://incubator.apache.org/solr/";>Solr Modified: lucene/nutch/trunk/site/bot.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/bot.html?view=diff&rev=481445&r1=481444&r2=481445 == --- lucene/nutch/trunk/site/bot.html (original) +++ lucene/nutch/trunk/site/bot.html Fri Dec 1 15:25:49 2006 @@ -21,7 +21,7 @@ 
-http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; title="Apache Lucene"> +http://lucene.apache.org/";> http://lucene.apache.org/nutch/";> @@ -127,6 +127,9 @@ http://lucene.apache.org/hadoop/";>Hadoop + + +http://incubator.apache.org/solr/";>Solr Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?view=diff&rev=481445&r1=481444&r2=481445 == --- lucene/nutch/trunk/site/credits.html (original) +++ lucene/nutch/trunk/site/credits.html Fri Dec 1 15:25:49 2006 @@ -21,7 +21,7 @@ -http://lucene.apache.org/";>http://lucene.apache.org/java/docs/images/lucene_green_150.gif"; title="Apache Lucene"> +http://lucene.apache.org/";> http://lucene.apache.org/nutch/";> @@ -128,6 +128,9 @@ http://lucene.apache.org/hadoop/";>Hadoop + +http://incubator.apache.org/solr/";>Solr + @@ -178,7 +181,9 @@ http://www-scf.usc.edu/~mattmann/";>Chris A. Mattmann -Sami Siren + +http://people.apache.org/~siren";>Sami Siren + John Xing @@ -186,7 +191,7 @@ - + Friends @@ -217,7 +222,7 @@ - + Sponsors Modified: lucene/nutch/trunk/site/credits.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?view=diff&rev=481445&r1=481444&r2=481445 == --- lucene/nutch/trunk/site/credits.pdf (original) +++ lucene/nutch/trunk/site/credits.pdf Fri Dec 1 15:25:49 2006 @@ -58,10 +58,10 @@ >> endobj 14 0 obj -<< /Length 2280 /Filter [ /ASCII85Decode /FlateDecode ] +<< /Length 2321 /Filter [ /ASCII85Decode /FlateDecode ] >> stream -Gat=-b?![^&Dd([EMAIL PROTECTED](g>8rh)98bLREO<8q+kCLYQbWS$l/LH`\btQ/n-u`VVIQ&afn\c,1ap*Z)a&i/`+[!+V0JT.oNcCqRNCM#Z%Q7thgF3mBRPUaU?!II$k!h0`E(I'[EMAIL 
PROTECTED]&#JoCC%72pi_(rVhVI2C`JNP"*haZD!Kkq`97s3h<<8`I.lInD8LM\Y<,#I2-"X][4IAapZS@>EZ&mT8Nm8R4TNIZLNET"RF\SWHFbVa,^l2jZjPgmO:>\.ABf1`A4>ACR*XGDan"e0]76F[&WL?,LIXdHp1WR8+Q1M[.m46hgr<`nCaWJFa=We`'pqfds69Y;Sk_f>&\13ol[o=g:4aiE]qFqj^WB`92G%3f&kP"Q*u'ii4[[,h(DtEpcP473CP7g/5LB!FOs_<;5i$peluanbc&!5Ucp$oJ/:$XWp'jS`97BNiWMF#*k=TJ>gFf'+QbGL:@e DFBI62ZBS9&U-_*&[EMAIL PROTECTED],02,gK0kj-iV,*#$A8UroM-[Zj7fp&V9[aI
svn commit: r481437 [2/2] - in /lucene/nutch/trunk/contrib/web2: ./ plugins/ plugins/web-caching-oscache/ plugins/web-clustering/ plugins/web-clustering/src/web/web-clustering/ plugins/web-keymatch/ p
Modified: lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties?view=diff&rev=481437&r1=481436&r2=481437 == --- lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties (original) +++ lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_ca.properties Fri Dec 1 15:01:00 2006 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ search.title = resultats de cerca search.search = Cerca search.hits = Coincidències {0}-{1} (d'un total de {2} documents coincidents): Modified: lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties?view=diff&rev=481437&r1=481436&r2=481437 == --- lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties (original) +++ lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_de.properties Fri Dec 1 15:01:00 2006 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ lang=de anchors.title = Verweise anchors.anchors = Herzeigende Link-Texte: Modified: lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties?view=diff&rev=481437&r1=481436&r2=481437 == --- lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties (original) +++ lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_en.properties Fri Dec 1 15:01:00 2006 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + #This is the default resource file for nutch ui localization. #If you create a new localized version of resources, please use #this as the base @@ -96,4 +111,4 @@ preferences.numResults.info= #text on save button -preferences.submit=Save and return to search \ No newline at end of file +preferences.submit=Save and return to search Modified: lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_es.properties URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/src/main/resources/org/nutch/jsp/resources_es.properties?view=diff&rev=481437&r1=481436&r2=481437 ===
svn commit: r477786 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml
Author: siren Date: Tue Nov 21 09:51:57 2006 New Revision: 477786 URL: http://svn.apache.org/viewvc?view=rev&rev=477786 Log: NUTCH-362 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/parse-plugins.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=477786&r1=477785&r2=477786 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Nov 21 09:51:57 2006 @@ -83,6 +83,9 @@ 27. NUTCH-405 - Content object is not properly initialized in map method of ParseSegment (siren) +28. NUTCH-362 - Remove parse-text from unsupported filetypes in +parse-plugins.xml (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?view=diff&rev=477786&r1=477785&r2=477786 == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Nov 21 09:51:57 2006 @@ -22,16 +22,6 @@ - - - - - - - - - @@ -46,7 +36,6 @@ - @@ -113,20 +102,8 @@ - - - - - - - - - - - - - + @@ -145,7 +122,6 @@ - @@ -158,66 +134,14 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -238,30 +162,15 @@ - - - - - - - - - - - - - - -
svn commit: r477757 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/Content.java
Author: siren Date: Tue Nov 21 09:19:51 2006 New Revision: 477757 URL: http://svn.apache.org/viewvc?view=rev&rev=477757 Log: NUTCH-405 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=477757&r1=477756&r2=477757 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Nov 21 09:19:51 2006 @@ -80,6 +80,9 @@ 26. NUTCH-403 - Make URL filtering optional in Generator (siren) +27. NUTCH-405 - Content object is not properly initialized in map method +of ParseSegment (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=477757&r1=477756&r2=477757 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Nov 21 09:19:51 2006 @@ -66,7 +66,8 @@ newKey.set(key.toString()); key = newKey; } -Content content = (Content)value; +Content content = (Content) value; +content.forceInflate(); Parse parse = null; ParseStatus status; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=477757&r1=477756&r2=477757 == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Tue Nov 21 09:19:51 2006 @@ -298,4 +298,12 @@ return typeName; } + /** + * By calling this method caller forces the next access to any property (via + * getters and setters) to check if 
decompressing of data is really required. + */ + public void forceInflate() { +inflated = false; + } + }
svn commit: r476879 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/test/ src/test/org/apache/nutch/crawl/ src/test/org/apache/nutch/fetcher/
Author: siren Date: Sun Nov 19 10:48:39 2006 New Revision: 476879 URL: http://svn.apache.org/viewvc?view=rev&rev=476879 Log: NUTCH-403 Make URL filtering optional in Generator Added: lucene/nutch/trunk/src/test/filter-all.txt Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476879&r1=476878&r2=476879 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 10:48:39 2006 @@ -78,6 +78,8 @@ 25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren) +26. NUTCH-403 - Make URL filtering optional in Generator (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=476879&r1=476878&r2=476879 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sun Nov 19 10:48:39 2006 @@ -115,9 +115,8 @@ injector.inject(crawlDb, rootUrlDir); for (int i = 0; i < depth; i++) { // generate new segment - Path segment = -generator.generate(crawlDb, segments, -1, - topN, System.currentTimeMillis()); + Path segment = generator.generate(crawlDb, segments, -1, topN, System + .currentTimeMillis(), false); fetcher.fetch(segment, threads); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segment);// parse it, if needed Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=476879&r1=476878&r2=476879 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Nov 19 10:48:39 2006 @@ -44,6 +44,7 @@ /** Generates a subset of a crawl db to fetch. */ public class Generator extends ToolBase { + public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter"; public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip"; public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host"; public static final String CRAWL_TOP_N = "crawl.topN"; @@ -89,6 +90,7 @@ private FloatWritable sortValue = new FloatWritable(); private boolean byIP; private long dnsFailure = 0L; +private boolean filter; public void configure(JobConf job) { curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis()); @@ -99,6 +101,7 @@ normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job); hostPartitioner.configure(job); + filter = job.getBoolean(CRAWL_GENERATE_FILTER, true); } public void close() {} @@ -108,13 +111,16 @@ OutputCollector output, Reporter reporter) throws IOException { Text url = (Text)key; - // don't generate URLs that don't pass URLFilters - try { -if (filters.filter(url.toString()) == null) - return; - } catch (URLFilterException e) { -if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")"); + if (filter) { +// If filtering is on don't generate URLs that don't pass URLFilters +try { + if (filters.filter(url.toString()) == null) +return; +} catch (URLFilterException e) { + if (LOG.isWarnEnabled()) { +LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() ++ ")"); + } } } CrawlDatum crawlDatum = (CrawlDatum)value; @@ -291,13 +297,13 @@ /** Generate fetchlists in a segment. 
*/ public Path generate(Path dbDir, Path segments) throws IOException { -return generate(dbDir, segments, --1, Long.MAX_VALUE, System.currentTimeMillis()); +return generate(dbDir, se
svn commit: r476814 - /lucene/nutch/trunk/CHANGES.txt
Author: siren Date: Sun Nov 19 05:13:54 2006 New Revision: 476814 URL: http://svn.apache.org/viewvc?view=rev&rev=476814 Log: NUTCH-404 Fix LinkDB Usage - implementation mismatch Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476814&r1=476813&r2=476814 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 05:13:54 2006 @@ -76,6 +76,8 @@ 24. NUTCH-388 - nutch-default.xml has outdated example for urlfilter.order (reported by Jared Dunne) +25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop
svn commit: r476810 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
Author: siren Date: Sun Nov 19 04:54:29 2006 New Revision: 476810 URL: http://svn.apache.org/viewvc?view=rev&rev=476810 Log: NUTCH-404 Fix LinkDB Usage - implementation mismatch Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?view=diff&rev=476810&r1=476809&r2=476810 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sun Nov 19 04:54:29 2006 @@ -329,12 +329,12 @@ public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: LinkDb (-dir | ...) [-noNormalizing] [-noFiltering]"); + System.err.println("Usage: LinkDb (-dir | ...) [-noNormalize] [-noFilter]"); System.err.println("\tlinkdb\toutput LinkDb to create or update"); System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR"); System.err.println("\tseg1 seg2 ...\t list of segment directories"); - System.err.println("\t-noNormalizing\tdon't normalize link URLs"); - System.err.println("\t-noFiltering\tdon't apply URLFilters to link URLs"); + System.err.println("\t-noNormalize\tdon't normalize link URLs"); + System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs"); return -1; } Path segDir = null; @@ -370,7 +370,5 @@ return -1; } } - - }
svn commit: r476617 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml
Author: siren Date: Sat Nov 18 13:55:44 2006 New Revision: 476617 URL: http://svn.apache.org/viewvc?view=rev&rev=476617 Log: NUTCH-388 Fix description of urlfilter.order Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476617&r1=476616&r2=476617 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 18 13:55:44 2006 @@ -73,6 +73,9 @@ 23. NUTCH-395 - Increase fetching speed (siren) +24. NUTCH-388 - nutch-default.xml has outdated example for urlfilter.order +(reported by Jared Dunne) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=476617&r1=476616&r2=476617 == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sat Nov 18 13:55:44 2006 @@ -780,7 +780,7 @@ plugin-includes and plugin-excludes above) are loaded and applied in system defined order. If not empty, only named filters are loaded and applied in given order. For example, if this property has value: - org.apache.nutch.net.RegexURLFilter org.apache.nutch.net.PrefixURLFilter + org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter then RegexURLFilter is applied first, and PrefixURLFilter second. Since all filters are AND'ed, filter ordering does not have impact on end result, but it may have performance implication, depending
svn commit: r475382 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: siren Date: Wed Nov 15 11:49:55 2006 New Revision: 475382 URL: http://svn.apache.org/viewvc?view=rev&rev=475382 Log: oh well Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475382&r1=475381&r2=475382 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 15 11:49:55 2006 @@ -91,10 +91,10 @@ private long dnsFailure = 0L; public void configure(JobConf job) { - curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis()); - limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks(); - maxPerHost = job.getInt("generate.max.per.host", -1); - byIP = job.getBoolean("generate.max.per.host.by.ip", false); + curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis()); + limit = job.getLong(CRAWL_TOP_N,Long.MAX_VALUE)/job.getNumReduceTasks(); + maxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1); + byIP = job.getBoolean(GENERATE_MAX_PER_HOST_BY_IP, false); filters = new URLFilters(job); normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job);
svn commit: r475380 - /lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
Author: siren Date: Wed Nov 15 11:43:48 2006 New Revision: 475380 URL: http://svn.apache.org/viewvc?view=rev&rev=475380 Log: added more junit tests Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475380&r1=475379&r2=475380 == --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Nov 15 11:43:48 2006 @@ -182,7 +182,8 @@ } /** - * Test that generator obeys the property "generate.max.per.host". + * Test that generator obeys the property "generate.max.per.host" and + * "generate.max.per.host.by.ip". * @throws Exception */ public void testGenerateHostIPLimit() throws Exception{
svn commit: r475378 - in /lucene/nutch/trunk/src: java/org/apache/nutch/crawl/Generator.java test/org/apache/nutch/crawl/TestGenerator.java
Author: siren Date: Wed Nov 15 11:42:22 2006 New Revision: 475378 URL: http://svn.apache.org/viewvc?view=rev&rev=475378 Log: added more junit tests Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475378&r1=475377&r2=475378 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 15 11:42:22 2006 @@ -44,6 +44,10 @@ /** Generates a subset of a crawl db to fetch. */ public class Generator extends ToolBase { + public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip"; + public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host"; + public static final String CRAWL_TOP_N = "crawl.topN"; + public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime"; public static final Log LOG = LogFactory.getLog(Generator.class); public static class SelectorEntry implements Writable { Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475378&r1=475377&r2=475378 == --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Nov 15 11:42:22 2006 @@ -31,12 +31,10 @@ import junit.framework.TestCase; /** - * Basic generator test: - * 1. Insert entries in crawldb - * 2. Generates entries to fetch - * 3. Verifies that number of generated urls match - * 4. Verifies that highest scoring urls are generated - + * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to + * fetch 3. 
Verifies that number of generated urls match 4. Verifies that + * highest scoring urls are generated + * * @author nutch-dev * */ @@ -50,11 +48,11 @@ FileSystem fs; - final static Path testdir=new Path("build/test/generator-test"); + final static Path testdir = new Path("build/test/generator-test"); protected void setUp() throws Exception { conf = CrawlDBTestUtil.createConfiguration(); -fs=FileSystem.get(conf); +fs = FileSystem.get(conf); fs.delete(testdir); } @@ -70,81 +68,243 @@ } /** - * Test that generator generates fetchlish ordered by score (desc) - * + * Test that generator generates fetchlish ordered by score (desc). + * * @throws Exception */ public void testGenerateHighest() throws Exception { -int NUM_RESULTS=2; - +final int NUM_RESULTS = 2; + ArrayList list = new ArrayList(); - -for(int i=0;i<=100;i++){ - list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/"; + pad(i)), -new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i))); + +for (int i = 0; i <= 100; i++) { + list.add(createURLCrawlDatum("http://aaa/"; + pad(i), + 1, i)); } - -dbDir = new Path(testdir, "crawldb"); -segmentsDir = new Path(testdir, "segments"); -fs.mkdirs(dbDir); -fs.mkdirs(segmentsDir); - -// create crawldb -CrawlDBTestUtil.createCrawlDb(fs, dbDir, list); - -// generate segment -Generator g=new Generator(conf); -Path generatedSegment=g.generate(dbDir, segmentsDir, -1, NUM_RESULTS, Long.MAX_VALUE); - -Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-0"); - -// verify results -SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf); - -ArrayList l=new ArrayList(); - -READ: - do { - Text key=new Text(); - CrawlDatum value=new CrawlDatum(); - if(!reader.next(key, value)) break READ; - l.add(new URLCrawlDatum(key, value)); -} while(true); -reader.close(); +createCrawlDB(list); +Path generatedSegment = generateFetchlist(NUM_RESULTS, conf); + +Path fetchlist = new Path(new Path(generatedSegment, 
+CrawlDatum.GENERATE_DIR_NAME), "part-0"); + +ArrayList l = readContents(fetchlist); + // sort urls by score desc Collections.sort(l, new ScoreComparator()); -//verify we got right amount of records +// verify we got right amount of records assertEquals(NUM_RESULTS, l.size()); -//verify we have the highest scoring urls +// verify we have the highest scoring urls
svn commit: r474464 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/protocol/ src/test/org/apache/nutch/metadata/ src/test/org/apache/nutch/protocol/
Author: siren Date: Mon Nov 13 11:46:56 2006 New Revision: 474464 URL: http://svn.apache.org/viewvc?view=rev&rev=474464 Log: NUTCH-395 Increase fetching speed Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=474464&r1=474463&r2=474464 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Nov 13 11:46:56 2006 @@ -71,6 +71,7 @@ 22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren) +23. NUTCH-395 - Increase fetching speed (siren) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=474464&r1=474463&r2=474464 == --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Nov 13 11:46:56 2006 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
@@ -16,103 +16,58 @@ */ package org.apache.nutch.metadata; -// JDK imports import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; -import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Properties; - -// Commons Lang imports -import org.apache.commons.lang.StringUtils; - -// Hadoop imports import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; /** - * A syntax tolerant and multi-valued metadata container. - * - * All the static String fields declared by this class are used as reference - * names for syntax correction on meta-data naming. + * A multi-valued metadata container. * * @author Chris Mattmann * @author Jérôme Charron + * */ -public class Metadata implements CreativeCommons, - DublinCore, - HttpHeaders, - Nutch, - Office, - Writable { - - - /** Used to format DC dates for the DATE metadata field */ - public final static SimpleDateFormat DATE_FORMAT = - new SimpleDateFormat("-MM-dd"); - - - private final static Map NAMES_IDX = new HashMap(); - private static String[] normalized = null; - - // Uses self introspection to fill the metanames index and the - // metanames list. - static { -Field[] fields = Metadata.class.getFields(); -for (int i=0; i metadata = null; + - - /** Constructs a new, empty metadata. */ + /** + * Constructs a new, empty metadata. + */ public Metadata() { -metadata = new HashMap(); +metadata = new HashMap(); } /** + * Returns true if named value is multivalued. 
+ * @param name name of metadata + * @return true if named value is multivalued, false if single + * value or null */ - public boolean isMultiValued(String name) { -return getValues(name).length > 1; + public boolean isMultiValued(final String name) { +return metadata.get(name) != null && metadata.get(name).length > 1; } /** * Returns an array of the names contained in the metadata. + * @return Metadata names */ public String[] names() { -Iterator iter = metadata.keySet().iterator(); -List names = new ArrayList(); -while(iter.hasNext()) { - names.add(getNormalizedName((String) iter.next())); -} -return (String[]) names.toArray(new String[names.size()]); +return metadata.keySet().toArray(new String[metadata.keySet().size()]); } - + /** * Get the value associated to a metadata name. * If many values are associated to the specified name, then the first @@ -121,12 +76,12 @@ * @param name of the metadata. * @return the value associated to the specified metadata name. */ -
svn commit: r473955 - in /lucene/nutch/trunk: contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/ contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/
Author: siren Date: Sun Nov 12 04:34:17 2006 New Revision: 473955 URL: http://svn.apache.org/viewvc?view=rev&rev=473955 Log: NUTCH-400 add missing header to .java Modified: lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java lucene/nutch/trunk/contrib/web2/src/main/java/org/apache/nutch/webapp/common/ServletContextServiceLocator.java lucene/nutch/trunk/contrib/web2/src/main/java/org/apache/nutch/webapp/common/Startable.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java lucene/nutch/trunk/src/test/org/apache/nutch/searcher/DistributedSearchTest.java lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java Modified: lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java?view=diff&rev=473955&r1=473954&r2=473955 == --- 
lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/java/org/apache/nutch/cache/CustomDiskPersistenceListener.java Sun Nov 12 04:34:17 2006 @@ -1,3 +1,19 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ package org.apache.nutch.cache; import com.opensymphony.oscache.plugins.diskpersistence.AbstractDiskPersistenceListener; Modified: lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java?view=diff&rev=473955&r1=473954&r2=473955 == --- lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-clustering/src/java/org/apache/nutch/webapp/controller/ClusteringCheckboxController.java Sun Nov 12 04:34:17 2006 @@ -1,3 +1,19 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ package org.apache.nutch.webapp.controller; import java.io.IOExcept
svn commit: r473727 - /lucene/nutch/trunk/CHANGES.txt
Author: siren Date: Sat Nov 11 07:27:40 2006 New Revision: 473727 URL: http://svn.apache.org/viewvc?view=rev&rev=473727 Log: NUTCH-399 Change CommandRunner to use concurrent api from jdk Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=473727&r1=473726&r2=473727 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 11 07:27:40 2006 @@ -69,6 +69,8 @@ 21. NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one partition. (ab) +22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren) + Release 0.8 - 2006-07-25