j...

ab Thu, 22 Dec 2005 17:17:29 -0800

Author: ab
Date: Thu Dec 22 17:16:31 2005
New Revision: 358674

URL: http://svn.apache.org/viewcvs?rev=358674&view=rev
Log:
Remove traces of the old API FetcherOutput.


The old IndexSegment is now marked as broken. In the next step, the old
utilities should be removed.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
    
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
    
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java Thu Dec 22 
17:16:31 2005
@@ -213,15 +213,8 @@
 //     }
 
     try {
-      // dummy up a FetcherOutput so that we can use existing indexing filters
-      // TODO: modify IndexingFilter interface to use Inlinks, etc. 
-      FetcherOutput fo =
-        new FetcherOutput(new FetchListEntry(true,new Page((UTF8)key),anchors),
-                          null, null);
-      fo.setFetchDate(fetchDatum.getFetchTime());
-
       // run indexing filters
-      doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),fo);
+      doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData), 
(UTF8)key, fetchDatum, inlinks);
     } catch (IndexingException e) {
       LOG.warning("Error indexing "+key+": "+e);
       return;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Thu 
Dec 22 17:16:31 2005
@@ -75,6 +75,7 @@
   public void setScorePower(float power) { scorePower = power; }
 
   public void indexPages() throws Exception {
+/*
       //
       // First, see if it's ever been indexed before
       //
@@ -184,6 +185,7 @@
       float eps = (float) count / (float) (delta / 1000);
       LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + 
total +
               " records in " + ((float) delta / 1000f) + " s (" + eps + " 
rec/s).");
+*/
   }
 
   /** 
@@ -229,6 +231,9 @@
    * Create an index for the input files in the named directory. 
    */
   public static void main(String[] args) throws Exception {
+      System.err.println("ERROR: use org.apache.nutch.crawl.Indexer instead.");
+      System.exit(0);
+
       String usage = "IndexSegment (-local | -ndfs <namenode:port>) 
<segment_directory> [-dir <workingdir>]";
       if (args.length == 0) {
           System.err.println("Usage: " + usage);

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java 
Thu Dec 22 17:16:31 2005
@@ -18,7 +18,10 @@
 
 import org.apache.lucene.document.Document;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
+import org.apache.nutch.io.UTF8;
 
 /** Extension point for indexing.  Permits one to add metadata to the indexed
  * fields.  All plugins found which implement this extension point are run
@@ -28,8 +31,18 @@
   /** The name of the extension point. */
   final static String X_POINT_ID = IndexingFilter.class.getName();
 
-  /** Adds fields or otherwise modifies the document that will be indexed for a
-   * parse. */
-  Document filter(Document doc, Parse parse, FetcherOutput fo)
+  /**
+   * Adds fields or otherwise modifies the document that will be indexed for a
+   * parse.
+   * 
+   * @param doc document instance for collecting fields
+   * @param parse parse data instance
+   * @param url page url
+   * @param datum crawl datum for the page
+   * @param inlinks page inlinks
+   * @return modified (or a new) document instance
+   * @throws IndexingException
+   */
+  Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, 
Inlinks inlinks)
     throws IndexingException;
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java 
Thu Dec 22 17:16:31 2005
@@ -22,7 +22,10 @@
 
 import org.apache.nutch.plugin.*;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
+import org.apache.nutch.io.UTF8;
 
 /** Creates and caches {@link IndexingFilter} implementing plugins.*/
 public class IndexingFilters {
@@ -39,6 +42,7 @@
       for (int i = 0; i < extensions.length; i++) {
         Extension extension = extensions[i];
         IndexingFilter filter = 
(IndexingFilter)extension.getExtensionInstance();
+        System.out.println("-adding " + filter.getClass().getName());
         if (!filterMap.containsKey(filter.getClass().getName())) {
                filterMap.put(filter.getClass().getName(), filter);
         }
@@ -52,11 +56,11 @@
   private  IndexingFilters() {}                  // no public ctor
 
   /** Run all defined filters. */
-  public static Document filter(Document doc, Parse parse, FetcherOutput fo)
+  public static Document filter(Document doc, Parse parse, UTF8 url, 
CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
 
     for (int i = 0; i < CACHE.length; i++) {
-      doc = CACHE[i].filter(doc, parse, fo);
+      doc = CACHE[i].filter(doc, parse, url, datum, inlinks);
     }
 
     return doc;

Modified: 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
 Thu Dec 22 17:16:31 2005
@@ -23,7 +23,10 @@
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
 
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.pagedb.FetchListEntry;
 
@@ -42,16 +45,16 @@
   /** The name of the document field we use. */
   public static String FIELD = "cc";
 
-  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum 
datum, Inlinks inlinks)
     throws IndexingException {
     
     // index the license
     String licenseUrl = parse.getData().get("License-Url");
     if (licenseUrl != null) {
-      LOG.info("CC: indexing "+licenseUrl+" for: "+fo.getUrl());
+      LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
 
       // add the entire license as cc:license=xxx
-      addFeature(doc, "license="+licenseUrl);
+      addFeature(doc, "license=" + licenseUrl);
 
       // index license attributes extracted of the license url
       addUrlFeatures(doc, licenseUrl);
@@ -60,7 +63,7 @@
     // index the license location as cc:meta=xxx
     String licenseLocation = parse.getData().get("License-Location");
     if (licenseLocation != null) {
-      addFeature(doc, "meta="+licenseLocation);
+      addFeature(doc, "meta=" + licenseLocation);
     }
 
     // index the work type cc:type=xxx
@@ -91,7 +94,7 @@
         addFeature(doc, feature);
       }
     } catch (MalformedURLException e) {
-      LOG.warning("CC: failed to parse url: "+urlString+" : "+e);
+      LOG.warning("CC: failed to parse url: " + urlString + " : " + e);
     }
   }
   

Modified: 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 Thu Dec 22 17:16:31 2005
@@ -23,10 +23,14 @@
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
 
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.pagedb.FetchListEntry;
 
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.logging.Logger;
@@ -41,13 +45,12 @@
   private static final int MAX_TITLE_LENGTH =
     NutchConf.get().getInt("indexer.max.title.length", 100);
 
-  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum 
datum, Inlinks inlinks)
     throws IndexingException {
     
-    String url = fo.getUrl().toString();
     String host = null;
     try {
-      URL u = new URL(url);
+      URL u = new URL(url.toString());
       host = u.getHost();
     } catch (MalformedURLException e) {
       throw new IndexingException(e);
@@ -62,15 +65,19 @@
 
 
     // url is both stored and indexed, so it's both searchable and returned
-    doc.add(Field.Text("url", url));
+    doc.add(Field.Text("url", url.toString()));
     
     // content is indexed, so that it's searchable, but not stored in index
     doc.add(Field.UnStored("content", parse.getText()));
     
     // anchors are indexed, so they're searchable, but not stored in index
-    String[] anchors = fo.getAnchors();
-    for (int i = 0; i < anchors.length; i++) {
-      doc.add(Field.UnStored("anchor", anchors[i]));
+    try {
+      String[] anchors = inlinks.getAnchors();
+      for (int i = 0; i < anchors.length; i++) {
+        doc.add(Field.UnStored("anchor", anchors[i]));
+      }
+    } catch (IOException ioe) {
+      LOG.warning("BasicIndexingFilter: can't get anchors for " + 
url.toString());
     }
 
     // title

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Thu Dec 22 17:16:31 2005
@@ -33,7 +33,10 @@
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
 
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
 
 import org.apache.nutch.util.NutchConf;
@@ -81,21 +84,20 @@
         MimeTypes.get(NutchConf.get().get("mime.types.file"));
 
   
-  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum 
datum, Inlinks inlinks)
     throws IndexingException {
 
-    String url = fo.getUrl().toString();
-
+    String url_s = url.toString();
     // normalize metaData (see note in the method below).
     ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
 
-    addTime(doc, metaData, url, fo);
+    addTime(doc, metaData, url_s, datum);
 
-    addLength(doc, metaData, url);
+    addLength(doc, metaData, url_s);
 
-    addType(doc, metaData, url);
+    addType(doc, metaData, url_s);
 
-    resetTitle(doc, metaData, url);
+    resetTitle(doc, metaData, url_s);
 
     return doc;
   }
@@ -103,7 +105,7 @@
   // Add time related meta info.  Add last-modified if present.  Index date as
   // last-modified, or, if that's not present, use fetch time.
   private Document addTime(Document doc, ContentProperties metaData, String 
url,
-                           FetcherOutput fo) {
+                           CrawlDatum datum) {
     long time = -1;
 
     String lastModified = metaData.getProperty("last-modified");
@@ -114,7 +116,7 @@
     }
 
     if (time == -1) {                             // if no last-modified
-      time = fo.getFetchDate();                   // use fetch time
+      time = datum.getFetchTime();                   // use fetch time
     }
 
     // add support for query syntax date:

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
 Thu Dec 22 17:16:31 2005
@@ -17,9 +17,12 @@
 
 
 // Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.parse.Parse;
 
 // Lucene imports
@@ -54,7 +57,7 @@
   }
 
   // Inherited JavaDoc
-  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum 
datum, Inlinks inlinks)
     throws IndexingException {
 
     //check if X-meta-lang found, possibly put there by HTMLLanguageParser

svn commit: r358674 - in /lucene/nutch/trunk/src: java/org/apache/nutch/crawl/ java/org/apache/nutch/indexer/ plugin/creativecommons/src/java/org/creativecommons/nutch/ plugin/index-basic/src/java/org/apache/nutch/indexer/basic/ plugin/index-more/src/j...

Reply via email to