Author: ab Date: Thu Dec 22 17:16:31 2005 New Revision: 358674 URL: http://svn.apache.org/viewcvs?rev=358674&view=rev Log: Remove traces of the old API FetcherOutput.
The old IndexSegment is now marked broken. In the next step old utilities should be removed. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java Thu Dec 22 17:16:31 2005 @@ -213,15 +213,8 @@ // } try { - // dummy up a FetcherOutput so that we can use existing indexing filters - // TODO: modify IndexingFilter interface to use Inlinks, etc. 
- FetcherOutput fo = - new FetcherOutput(new FetchListEntry(true,new Page((UTF8)key),anchors), - null, null); - fo.setFetchDate(fetchDatum.getFetchTime()); - // run indexing filters - doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),fo); + doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData), (UTF8)key, fetchDatum, inlinks); } catch (IndexingException e) { LOG.warning("Error indexing "+key+": "+e); return; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Thu Dec 22 17:16:31 2005 @@ -75,6 +75,7 @@ public void setScorePower(float power) { scorePower = power; } public void indexPages() throws Exception { +/* // // First, see if it's ever been indexed before // @@ -184,6 +185,7 @@ float eps = (float) count / (float) (delta / 1000); LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total + " records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s)."); +*/ } /** @@ -229,6 +231,9 @@ * Create an index for the input files in the named directory. 
*/ public static void main(String[] args) throws Exception { + System.err.println("ERROR: use org.apache.nutch.crawl.Indexer instead."); + System.exit(0); + String usage = "IndexSegment (-local | -ndfs <namenode:port>) <segment_directory> [-dir <workingdir>]"; if (args.length == 0) { System.err.println("Usage: " + usage); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Thu Dec 22 17:16:31 2005 @@ -18,7 +18,10 @@ import org.apache.lucene.document.Document; import org.apache.nutch.parse.Parse; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; +import org.apache.nutch.io.UTF8; /** Extension point for indexing. Permits one to add metadata to the indexed * fields. All plugins found which implement this extension point are run @@ -28,8 +31,18 @@ /** The name of the extension point. */ final static String X_POINT_ID = IndexingFilter.class.getName(); - /** Adds fields or otherwise modifies the document that will be indexed for a - * parse. */ - Document filter(Document doc, Parse parse, FetcherOutput fo) + /** + * Adds fields or otherwise modifies the document that will be indexed for a + * parse. 
+ * + * @param doc document instance for collecting fields + * @param parse parse data instance + * @param url page url + * @param datum crawl datum for the page + * @param inlinks page inlinks + * @return modified (or a new) document instance + * @throws IndexingException + */ + Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Thu Dec 22 17:16:31 2005 @@ -22,7 +22,10 @@ import org.apache.nutch.plugin.*; import org.apache.nutch.parse.Parse; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; +import org.apache.nutch.io.UTF8; /** Creates and caches {@link IndexingFilter} implementing plugins.*/ public class IndexingFilters { @@ -39,6 +42,7 @@ for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; IndexingFilter filter = (IndexingFilter)extension.getExtensionInstance(); + System.out.println("-adding " + filter.getClass().getName()); if (!filterMap.containsKey(filter.getClass().getName())) { filterMap.put(filter.getClass().getName(), filter); } @@ -52,11 +56,11 @@ private IndexingFilters() {} // no public ctor /** Run all defined filters. 
*/ - public static Document filter(Document doc, Parse parse, FetcherOutput fo) + public static Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { for (int i = 0; i < CACHE.length; i++) { - doc = CACHE[i].filter(doc, parse, fo); + doc = CACHE[i].filter(doc, parse, url, datum, inlinks); } return doc; Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Thu Dec 22 17:16:31 2005 @@ -23,7 +23,10 @@ import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.pagedb.FetchListEntry; @@ -42,16 +45,16 @@ /** The name of the document field we use. 
*/ public static String FIELD = "cc"; - public Document filter(Document doc, Parse parse, FetcherOutput fo) + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // index the license String licenseUrl = parse.getData().get("License-Url"); if (licenseUrl != null) { - LOG.info("CC: indexing "+licenseUrl+" for: "+fo.getUrl()); + LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); // add the entire license as cc:license=xxx - addFeature(doc, "license="+licenseUrl); + addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); @@ -60,7 +63,7 @@ // index the license location as cc:meta=xxx String licenseLocation = parse.getData().get("License-Location"); if (licenseLocation != null) { - addFeature(doc, "meta="+licenseLocation); + addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx @@ -91,7 +94,7 @@ addFeature(doc, feature); } } catch (MalformedURLException e) { - LOG.warning("CC: failed to parse url: "+urlString+" : "+e); + LOG.warning("CC: failed to parse url: " + urlString + " : " + e); } } Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Dec 22 17:16:31 2005 @@ -23,10 +23,14 @@ import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.io.UTF8; +import 
org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.pagedb.FetchListEntry; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.logging.Logger; @@ -41,13 +45,12 @@ private static final int MAX_TITLE_LENGTH = NutchConf.get().getInt("indexer.max.title.length", 100); - public Document filter(Document doc, Parse parse, FetcherOutput fo) + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String url = fo.getUrl().toString(); String host = null; try { - URL u = new URL(url); + URL u = new URL(url.toString()); host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); @@ -62,15 +65,19 @@ // url is both stored and indexed, so it's both searchable and returned - doc.add(Field.Text("url", url)); + doc.add(Field.Text("url", url.toString())); // content is indexed, so that it's searchable, but not stored in index doc.add(Field.UnStored("content", parse.getText())); // anchors are indexed, so they're searchable, but not stored in index - String[] anchors = fo.getAnchors(); - for (int i = 0; i < anchors.length; i++) { - doc.add(Field.UnStored("anchor", anchors[i])); + try { + String[] anchors = inlinks.getAnchors(); + for (int i = 0; i < anchors.length; i++) { + doc.add(Field.UnStored("anchor", anchors[i])); + } + } catch (IOException ioe) { + LOG.warning("BasicIndexingFilter: can't get anchors for " + url.toString()); } // title Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Dec 22 17:16:31 2005 @@ -33,7 +33,10 @@ import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.util.NutchConf; @@ -81,21 +84,20 @@ MimeTypes.get(NutchConf.get().get("mime.types.file")); - public Document filter(Document doc, Parse parse, FetcherOutput fo) + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String url = fo.getUrl().toString(); - + String url_s = url.toString(); // normalize metaData (see note in the method below). ContentProperties metaData = normalizeMeta(parse.getData().getMetadata()); - addTime(doc, metaData, url, fo); + addTime(doc, metaData, url_s, datum); - addLength(doc, metaData, url); + addLength(doc, metaData, url_s); - addType(doc, metaData, url); + addType(doc, metaData, url_s); - resetTitle(doc, metaData, url); + resetTitle(doc, metaData, url_s); return doc; } @@ -103,7 +105,7 @@ // Add time related meta info. Add last-modified if present. Index date as // last-modified, or, if that's not present, use fetch time. 
private Document addTime(Document doc, ContentProperties metaData, String url, - FetcherOutput fo) { + CrawlDatum datum) { long time = -1; String lastModified = metaData.getProperty("last-modified"); @@ -114,7 +116,7 @@ } if (time == -1) { // if no last-modified - time = fo.getFetchDate(); // use fetch time + time = datum.getFetchTime(); // use fetch time } // add support for query syntax date: Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Thu Dec 22 17:16:31 2005 @@ -17,9 +17,12 @@ // Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.io.UTF8; import org.apache.nutch.parse.Parse; // Lucene imports @@ -54,7 +57,7 @@ } // Inherited JavaDoc - public Document filter(Document doc, Parse parse, FetcherOutput fo) + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { //check if X-meta-lang found, possibly put there by HTMLLanguageParser ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. 
DOWNLOAD SPLUNK! http://ads.osdn.com/?ad_id=7637&alloc_id=16865&op=click _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs