Github user sebastian-nagel commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/95#discussion_r54782218
  
    --- Diff: src/java/org/apache/nutch/indexer/IndexerMapReduce.java ---
    @@ -166,235 +145,310 @@ private String filterUrl(String url) {
         return url;
       }
     
    -  public void map(Text key, Writable value,
    -      OutputCollector<Text, NutchWritable> output, Reporter reporter)
    -          throws IOException {
    +  /**
    +   * Implementation of {@link org.apache.hadoop.mapred.Mapper}
    +   * which optionally normalizes then filters a URL before simply
    +   * collecting key and values with the keys being URLs (manifested
    +   * as {@link org.apache.hadoop.io.Text}) and the 
    +   * values as {@link org.apache.nutch.crawl.NutchWritable} instances
    +   * of {@link org.apache.nutch.crawl.CrawlDatum}.
    +   */
    +  public static class IndexerMapReduceMapper implements Mapper<Text, 
Writable, Text, NutchWritable> {
    +
    +    @Override
    +    public void configure(JobConf job) {
    +    }
    +
    +    public void map(Text key, Writable value,
    +        OutputCollector<Text, NutchWritable> output, Reporter reporter)
    +            throws IOException {
    +
    +      String urlString = filterUrl(normalizeUrl(key.toString()));
    +      if (urlString == null) {
    +        return;
    +      } else {
    +        key.set(urlString);
    +      }
    +
    +      output.collect(key, new NutchWritable(value));
    +    }
     
    -    String urlString = filterUrl(normalizeUrl(key.toString()));
    -    if (urlString == null) {
    -      return;
    -    } else {
    -      key.set(urlString);
    +    @Override
    +    public void close() throws IOException {
         }
     
    -    output.collect(key, new NutchWritable(value));
       }
     
    -  public void reduce(Text key, Iterator<NutchWritable> values,
    -      OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    -          throws IOException {
    -    Inlinks inlinks = null;
    -    CrawlDatum dbDatum = null;
    -    CrawlDatum fetchDatum = null;
    -    Content content = null;
    -    ParseData parseData = null;
    -    ParseText parseText = null;
    -
    -    while (values.hasNext()) {
    -      final Writable value = values.next().get(); // unwrap
    -      if (value instanceof Inlinks) {
    -        inlinks = (Inlinks) value;
    -      } else if (value instanceof CrawlDatum) {
    -        final CrawlDatum datum = (CrawlDatum) value;
    -        if (CrawlDatum.hasDbStatus(datum)) {
    -          dbDatum = datum;
    -        } else if (CrawlDatum.hasFetchStatus(datum)) {
    -          // don't index unmodified (empty) pages
    -          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
    -            fetchDatum = datum;
    +  /**
    +   * Implementation of {@link org.apache.hadoop.mapred.Reducer}
    +   * which generates {@link org.apache.nutch.indexer.NutchIndexAction}'s
    +   * from combinations of various Nutch data structures. Essentially 
    +   * the result is a key representing a URL and a value representing a
    +   * unit of indexing holding the document and action information.
    +   */
    +  public static class IndexerMapReduceReducer implements Reducer<Text, 
NutchWritable, Text, NutchIndexAction> {
    +
    +    private boolean skip = false;
    +    private boolean delete = false;
    +    private boolean deleteRobotsNoIndex = false;
    +    private boolean deleteSkippedByIndexingFilter = false;
    +    private boolean base64 = false;
    +    private IndexingFilters filters;
    +    private ScoringFilters scfilters;
    +
    +    @Override
    +    public void configure(JobConf job) {
    +      Configuration conf = NutchConfiguration.create();
    --- End diff --
    
    JobConf extends Configuration, so there should be no need to create a new 
Configuration object.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

Reply via email to