GitHub user sebastian-nagel commented on a diff in the pull request:
https://github.com/apache/nutch/pull/95#discussion_r54779042
--- Diff: src/java/org/apache/nutch/indexer/IndexerMapReduce.java ---
@@ -166,235 +145,310 @@ private String filterUrl(String url) {
return url;
}
- public void map(Text key, Writable value,
- OutputCollector<Text, NutchWritable> output, Reporter reporter)
- throws IOException {
+ /**
+ * Implementation of {@link org.apache.hadoop.mapred.Mapper}
+ * which optionally normalizes then filters a URL before simply
+ * collecting key and values with the keys being URLs (manifested
+ * as {@link org.apache.hadoop.io.Text}) and the
+ * values as {@link org.apache.nutch.crawl.NutchWritable} instances
+ * of {@link org.apache.nutch.crawl.CrawlDatum}.
+ */
+ public static class IndexerMapReduceMapper implements Mapper<Text,
Writable, Text, NutchWritable> {
+
+ @Override
+ public void configure(JobConf job) {
+ }
+
+ public void map(Text key, Writable value,
+ OutputCollector<Text, NutchWritable> output, Reporter reporter)
+ throws IOException {
+
+ String urlString = filterUrl(normalizeUrl(key.toString()));
+ if (urlString == null) {
+ return;
+ } else {
+ key.set(urlString);
+ }
+
+ output.collect(key, new NutchWritable(value));
+ }
- String urlString = filterUrl(normalizeUrl(key.toString()));
- if (urlString == null) {
- return;
- } else {
- key.set(urlString);
+ @Override
+ public void close() throws IOException {
}
- output.collect(key, new NutchWritable(value));
}
- public void reduce(Text key, Iterator<NutchWritable> values,
- OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
- throws IOException {
- Inlinks inlinks = null;
- CrawlDatum dbDatum = null;
- CrawlDatum fetchDatum = null;
- Content content = null;
- ParseData parseData = null;
- ParseText parseText = null;
-
- while (values.hasNext()) {
- final Writable value = values.next().get(); // unwrap
- if (value instanceof Inlinks) {
- inlinks = (Inlinks) value;
- } else if (value instanceof CrawlDatum) {
- final CrawlDatum datum = (CrawlDatum) value;
- if (CrawlDatum.hasDbStatus(datum)) {
- dbDatum = datum;
- } else if (CrawlDatum.hasFetchStatus(datum)) {
- // don't index unmodified (empty) pages
- if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
- fetchDatum = datum;
+ /**
+ * Implementation of {@link org.apache.hadoop.mapred.Reducer}
+ * which generates {@link org.apache.nutch.indexer.NutchIndexAction}'s
+ * from combinations of various Nutch data structures. Essentially
+ * teh result is a key representing a URL and a value representing a
--- End diff --
Typo in the Javadoc above: "teh" should be "the".
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---