Author: ab
Date: Thu Dec 22 17:16:31 2005
New Revision: 358674
URL: http://svn.apache.org/viewcvs?rev=358674&view=rev
Log:
Remove traces of the old API FetcherOutput.
The old IndexSegment is now marked broken. In the next step old utilities
should be removed.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Indexer.java Thu Dec 22
17:16:31 2005
@@ -213,15 +213,8 @@
// }
try {
- // dummy up a FetcherOutput so that we can use existing indexing filters
- // TODO: modify IndexingFilter interface to use Inlinks, etc.
- FetcherOutput fo =
- new FetcherOutput(new FetchListEntry(true,new Page((UTF8)key),anchors),
- null, null);
- fo.setFetchDate(fetchDatum.getFetchTime());
-
// run indexing filters
- doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),fo);
+ doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),
(UTF8)key, fetchDatum, inlinks);
} catch (IndexingException e) {
LOG.warning("Error indexing "+key+": "+e);
return;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Thu
Dec 22 17:16:31 2005
@@ -75,6 +75,7 @@
public void setScorePower(float power) { scorePower = power; }
public void indexPages() throws Exception {
+/*
//
// First, see if it's ever been indexed before
//
@@ -184,6 +185,7 @@
float eps = (float) count / (float) (delta / 1000);
LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total +
" records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s).");
+*/
}
/**
@@ -229,6 +231,9 @@
* Create an index for the input files in the named directory.
*/
public static void main(String[] args) throws Exception {
+ System.err.println("ERROR: use org.apache.nutch.crawl.Indexer instead.");
+ System.exit(0);
+
String usage = "IndexSegment (-local | -ndfs <namenode:port>) <segment_directory> [-dir <workingdir>]";
if (args.length == 0) {
System.err.println("Usage: " + usage);
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
Thu Dec 22 17:16:31 2005
@@ -18,7 +18,10 @@
import org.apache.lucene.document.Document;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
+import org.apache.nutch.io.UTF8;
/** Extension point for indexing. Permits one to add metadata to the indexed
* fields. All plugins found which implement this extension point are run
@@ -28,8 +31,18 @@
/** The name of the extension point. */
final static String X_POINT_ID = IndexingFilter.class.getName();
- /** Adds fields or otherwise modifies the document that will be indexed for a
- * parse. */
- Document filter(Document doc, Parse parse, FetcherOutput fo)
+ /**
+ * Adds fields or otherwise modifies the document that will be indexed for a
+ * parse.
+ *
+ * @param doc document instance for collecting fields
+ * @param parse parse data instance
+ * @param url page url
+ * @param datum crawl datum for the page
+ * @param inlinks page inlinks
+ * @return modified (or a new) document instance
+ * @throws IndexingException
+ */
+ Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum,
Inlinks inlinks)
throws IndexingException;
}
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
Thu Dec 22 17:16:31 2005
@@ -22,7 +22,10 @@
import org.apache.nutch.plugin.*;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
+import org.apache.nutch.io.UTF8;
/** Creates and caches {@link IndexingFilter} implementing
plugins.*/
public class IndexingFilters {
@@ -39,6 +42,7 @@
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
IndexingFilter filter =
(IndexingFilter)extension.getExtensionInstance();
+ System.out.println("-adding " + filter.getClass().getName());
if (!filterMap.containsKey(filter.getClass().getName())) {
filterMap.put(filter.getClass().getName(), filter);
}
@@ -52,11 +56,11 @@
private IndexingFilters() {} // no public ctor
/** Run all defined filters. */
- public static Document filter(Document doc, Parse parse, FetcherOutput fo)
+ public static Document filter(Document doc, Parse parse, UTF8 url,
CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
for (int i = 0; i < CACHE.length; i++) {
- doc = CACHE[i].filter(doc, parse, fo);
+ doc = CACHE[i].filter(doc, parse, url, datum, inlinks);
}
return doc;
Modified:
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
Thu Dec 22 17:16:31 2005
@@ -23,7 +23,10 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.pagedb.FetchListEntry;
@@ -42,16 +45,16 @@
/** The name of the document field we use. */
public static String FIELD = "cc";
- public Document filter(Document doc, Parse parse, FetcherOutput fo)
+ public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum
datum, Inlinks inlinks)
throws IndexingException {
// index the license
String licenseUrl = parse.getData().get("License-Url");
if (licenseUrl != null) {
- LOG.info("CC: indexing "+licenseUrl+" for: "+fo.getUrl());
+ LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
// add the entire license as cc:license=xxx
- addFeature(doc, "license="+licenseUrl);
+ addFeature(doc, "license=" + licenseUrl);
// index license attributes extracted of the license url
addUrlFeatures(doc, licenseUrl);
@@ -60,7 +63,7 @@
// index the license location as cc:meta=xxx
String licenseLocation = parse.getData().get("License-Location");
if (licenseLocation != null) {
- addFeature(doc, "meta="+licenseLocation);
+ addFeature(doc, "meta=" + licenseLocation);
}
// index the work type cc:type=xxx
@@ -91,7 +94,7 @@
addFeature(doc, feature);
}
} catch (MalformedURLException e) {
- LOG.warning("CC: failed to parse url: "+urlString+" : "+e);
+ LOG.warning("CC: failed to parse url: " + urlString + " : " + e);
}
}
Modified:
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Thu Dec 22 17:16:31 2005
@@ -23,10 +23,14 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.pagedb.FetchListEntry;
+import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.logging.Logger;
@@ -41,13 +45,12 @@
private static final int MAX_TITLE_LENGTH =
NutchConf.get().getInt("indexer.max.title.length", 100);
- public Document filter(Document doc, Parse parse, FetcherOutput fo)
+ public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum
datum, Inlinks inlinks)
throws IndexingException {
- String url = fo.getUrl().toString();
String host = null;
try {
- URL u = new URL(url);
+ URL u = new URL(url.toString());
host = u.getHost();
} catch (MalformedURLException e) {
throw new IndexingException(e);
@@ -62,15 +65,19 @@
// url is both stored and indexed, so it's both searchable and returned
- doc.add(Field.Text("url", url));
+ doc.add(Field.Text("url", url.toString()));
// content is indexed, so that it's searchable, but not stored in index
doc.add(Field.UnStored("content", parse.getText()));
// anchors are indexed, so they're searchable, but not stored in index
- String[] anchors = fo.getAnchors();
- for (int i = 0; i < anchors.length; i++) {
- doc.add(Field.UnStored("anchor", anchors[i]));
+ try {
+ String[] anchors = inlinks.getAnchors();
+ for (int i = 0; i < anchors.length; i++) {
+ doc.add(Field.UnStored("anchor", anchors[i]));
+ }
+ } catch (IOException ioe) {
+ LOG.warning("BasicIndexingFilter: can't get anchors for " +
url.toString());
}
// title
Modified:
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Thu Dec 22 17:16:31 2005
@@ -33,7 +33,10 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.util.NutchConf;
@@ -81,21 +84,20 @@
MimeTypes.get(NutchConf.get().get("mime.types.file"));
- public Document filter(Document doc, Parse parse, FetcherOutput fo)
+ public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum
datum, Inlinks inlinks)
throws IndexingException {
- String url = fo.getUrl().toString();
-
+ String url_s = url.toString();
// normalize metaData (see note in the method below).
ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
- addTime(doc, metaData, url, fo);
+ addTime(doc, metaData, url_s, datum);
- addLength(doc, metaData, url);
+ addLength(doc, metaData, url_s);
- addType(doc, metaData, url);
+ addType(doc, metaData, url_s);
- resetTitle(doc, metaData, url);
+ resetTitle(doc, metaData, url_s);
return doc;
}
@@ -103,7 +105,7 @@
// Add time related meta info. Add last-modified if present. Index date as
// last-modified, or, if that's not present, use fetch time.
private Document addTime(Document doc, ContentProperties metaData, String
url,
- FetcherOutput fo) {
+ CrawlDatum datum) {
long time = -1;
String lastModified = metaData.getProperty("last-modified");
@@ -114,7 +116,7 @@
}
if (time == -1) { // if no last-modified
- time = fo.getFetchDate(); // use fetch time
+ time = datum.getFetchTime(); // use fetch time
}
// add support for query syntax date:
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=358674&r1=358673&r2=358674&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
Thu Dec 22 17:16:31 2005
@@ -17,9 +17,12 @@
// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.Parse;
// Lucene imports
@@ -54,7 +57,7 @@
}
// Inherited JavaDoc
- public Document filter(Document doc, Parse parse, FetcherOutput fo)
+ public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum
datum, Inlinks inlinks)
throws IndexingException {
//check if X-meta-lang found, possibly put there by HTMLLanguageParser