Hi all:

I'm working on a custom parser plugin to generate thumbnails from images 
fetched with Nutch 1.4. I'm doing this because the thumbnails will be converted 
into base64-encoded strings and stored in a Solr backend.

So I basically wrote a custom parser (to which I send all PNG images, for 
example). I enabled the plugin (image-thumbnail) in nutch-site.xml and set some 
custom properties to configure the width and height of the thumbnail. I also set 
the alias in parse-plugins.xml and, in the same file, set the plugin to handle 
image/png files.

The plugin is being loaded, but every time a PNG image is parsed I get 
this:

Error parsing: http://localhost/sites/all/themes/octavitos/images/iconos/audiointernet.png: java.lang.NullPointerException
        at org.apache.nutch.parse.ParserFactory.match(ParserFactory.java:388)
        at org.apache.nutch.parse.ParserFactory.getExtension(ParserFactory.java:397)
        at org.apache.nutch.parse.ParserFactory.matchExtensions(ParserFactory.java:296)
        at org.apache.nutch.parse.ParserFactory.findExtensions(ParserFactory.java:262)
        at org.apache.nutch.parse.ParserFactory.getExtensions(ParserFactory.java:234)
        at org.apache.nutch.parse.ParserFactory.getParsers(ParserFactory.java:119)
        at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:71)
        at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:86)
        at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:42)
        at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
        at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:358)
        at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307)
        at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177)

The thing is that I have put some log messages inside the getParse() method, but 
none of these messages are being logged to the hadoop.log file, so from what I 
can tell the method is not being executed. 

Does anyone have any idea what I'm doing wrong?

P.S: I've attached the source of the ImageThumbnailParser.

Greetings!


10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS 
INFORMATICAS...
CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION

http://www.uci.cu
http://www.facebook.com/universidad.uci
http://www.flickr.com/photos/universidad_uci
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.thumbnail;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javax.imageio.ImageIO;
import net.coobird.thumbnailator.Thumbnails;
import org.apache.commons.codec.binary.Base64;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.protocol.Content;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.w3c.dom.DocumentFragment;

/**
 * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
 * representation returned by Tika as SAX events *
 */
public class ImageThumbnailParser implements org.apache.nutch.parse.Parser {

    public static final Logger LOG = LoggerFactory.getLogger(ImageThumbnailParser.class);
    public final static String THUMBNAIL = "thumb";
    public final static String CONF_PROPERTY = "thumbnails.size"; // 100x100
    private int width = 0;
    private int height = 0;
    private Configuration conf;

    public ParseResult getParse(Content content) {
        String mimeType = content.getContentType();

        // get the right parser using the mime type as a clue
        String text = "";
        String title = "";
        Outlink[] outlinks = new Outlink[0];
        org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
/*
        BufferedImage image = null;
        ByteArrayOutputStream output = null;

        try {
            // Convert content to Image
            image = ImageIO.read(new ByteArrayInputStream(content.getContent()));

            // Resize the image
            Thumbnails.of(image).size(this.width, this.height).outputFormat("png").toOutputStream(output);

            // Convert to bytes[]
            // output = new ByteArrayOutputStream();
            ImageIO.write(image, "png", output);

        } catch (Exception e) {
            LOG.error("THUMBNAIL", ImageThumbnailParser.class.getName() + ": Can't thumbnail image from: " + content.getUrl());

            return new ParseStatus(ParseStatus.FAILED,
                    "Can't make thumbnail. " + e).getEmptyParseResult(content.getUrl(), getConf());
        }

        // Convert content into base64
        String encoded = Base64.encodeBase64String(output.toByteArray());
        text = encoded;

        nutchMetadata.add(THUMBNAIL, encoded);
*/
        // no outlinks? try OutlinkExtractor e.g works for mime types where no
        // explicit markup for anchors
        if (outlinks.length == 0) {
            outlinks = OutlinkExtractor.getOutlinks(text, getConf());
        }

        ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);

        ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
        ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));


        return parseResult;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;

        LOG.warn("PEPEPE");

        if (this.conf != null) {
            String value[] = conf.get(CONF_PROPERTY, "0x0").split("x");

            this.width = Integer.parseInt(value[0]);
            this.height = Integer.parseInt(value[1]);

            LOG.warn("ancho: " + this.width);
            LOG.warn("alto: " + this.height);
        }
    }

    public Configuration getConf() {
        return this.conf;
    }
}

Reply via email to