Hi all:
I'm working on a custom parser plugin to generate thumbnails from images
fetched with Nutch 1.4. I'm doing this because the thumbnails will be converted
into base64-encoded strings and stored in a Solr backend.
So I basically wrote a custom parser (to which I send all PNG images, for
example). I enabled the plugin (image-thumbnail) in nutch-site.xml and set some
custom properties to load the width and height of the thumbnail. I also set the
alias in parse-plugins.xml and, in the same file, set the plugin to handle
image/png files.
The plugin is being loaded, but every time I get a PNG image to parse I get
this:
Error parsing:
http://localhost/sites/all/themes/octavitos/images/iconos/audiointernet.png:
java.lang.NullPointerException
at org.apache.nutch.parse.ParserFactory.match(ParserFactory.java:388)
at
org.apache.nutch.parse.ParserFactory.getExtension(ParserFactory.java:397)
at
org.apache.nutch.parse.ParserFactory.matchExtensions(ParserFactory.java:296)
at
org.apache.nutch.parse.ParserFactory.findExtensions(ParserFactory.java:262)
at
org.apache.nutch.parse.ParserFactory.getExtensions(ParserFactory.java:234)
at
org.apache.nutch.parse.ParserFactory.getParsers(ParserFactory.java:119)
at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:71)
at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:86)
at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:42)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:358)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307)
at
org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177)
The thing is that I have put some log messages inside the getParse() method, but
none of these messages are being logged in the hadoop.log file, so from what I can
tell the method is not being executed.
Does anyone have any idea what I'm doing wrong?
P.S: I've attached the source of the ImageThumbnailParser.
Greetings!
10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS
INFORMATICAS...
CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION
http://www.uci.cu
http://www.facebook.com/universidad.uci
http://www.flickr.com/photos/universidad_uci
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.thumbnail;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javax.imageio.ImageIO;
import net.coobird.thumbnailator.Thumbnails;
import org.apache.commons.codec.binary.Base64;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.protocol.Content;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.w3c.dom.DocumentFragment;
/**
 * Nutch parser plugin intended to turn fetched images into thumbnails.
 *
 * <p>The thumbnail generation itself is currently disabled (see the commented-out
 * draft inside {@link #getParse(Content)}). As written, the parser returns a
 * successful parse with empty text, an empty title and no thumbnail metadata.
 *
 * <p>The target thumbnail size is read from the {@value #CONF_PROPERTY}
 * configuration property in {@code WIDTHxHEIGHT} form, e.g. {@code 100x100}.
 */
public class ImageThumbnailParser implements org.apache.nutch.parse.Parser {

  public static final Logger LOG = LoggerFactory.getLogger(ImageThumbnailParser.class);

  /** Metadata key under which the (currently disabled) draft stores the encoded thumbnail. */
  public final static String THUMBNAIL = "thumb";

  /** Configuration property holding the thumbnail size as {@code WIDTHxHEIGHT}. */
  public final static String CONF_PROPERTY = "thumbnails.size"; // 100x100

  /** Size used when {@link #CONF_PROPERTY} is not set. */
  private static final String DEFAULT_SIZE = "0x0";

  private int width = 0;
  private int height = 0;
  private Configuration conf;

  /**
   * Builds the parse result for the given content.
   *
   * <p>With the thumbnail draft disabled, the text and title are empty strings and
   * the parse metadata is empty; outlinks are whatever
   * {@code OutlinkExtractor.getOutlinks} returns for the empty text.
   *
   * @param content the fetched content to parse
   * @return a parse result with status {@code ParseStatus.SUCCESS} keyed by the content URL
   */
  public ParseResult getParse(Content content) {
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    /*
    // Disabled thumbnail draft, kept for reference.
    // NOTE(review): fix the following before re-enabling it:
    //  1. `output` is still null when it is handed to toOutputStream(); the
    //     ByteArrayOutputStream has to be created before that call.
    //  2. ImageIO.write(image, ...) writes the original, full-size image to the
    //     same stream after the thumbnail; it should most likely be removed.
    //  3. LOG.error("THUMBNAIL", msg) treats "THUMBNAIL" as the message format;
    //     without a {} placeholder the second argument is not printed.
    BufferedImage image = null;
    ByteArrayOutputStream output = null;
    try {
    // Convert content to Image
    image = ImageIO.read(new ByteArrayInputStream(content.getContent()));
    // Resize the image
    Thumbnails.of(image).size(this.width, this.height).outputFormat("png").toOutputStream(output);
    // Convert to bytes[]
    // output = new ByteArrayOutputStream();
    ImageIO.write(image, "png", output);
    } catch (Exception e) {
    LOG.error("THUMBNAIL", ImageThumbnailParser.class.getName() + ": Can't thumbnail image from: " + content.getUrl());
    return new ParseStatus(ParseStatus.FAILED,
    "Can't make thumbnail. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // Convert content into base64
    String encoded = Base64.encodeBase64String(output.toByteArray());
    text = encoded;
    nutchMetadata.add(THUMBNAIL, encoded);
    */

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors
    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    return parseResult;
  }

  /**
   * Stores the configuration and, when it is non-null, reads the thumbnail size
   * from {@link #CONF_PROPERTY}.
   *
   * <p>A malformed size value no longer throws from this method: it is reported
   * with a warning and the size falls back to 0x0.
   *
   * @param conf the configuration to use; may be {@code null}, in which case the
   *     previously stored width and height are left untouched
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    LOG.warn("PEPEPE"); // leftover debug marker from the original author
    if (this.conf != null) {
      int[] size = parseSize(conf.get(CONF_PROPERTY, DEFAULT_SIZE));
      this.width = size[0];
      this.height = size[1];
      LOG.warn("ancho: " + this.width);
      LOG.warn("alto: " + this.height);
    }
  }

  /**
   * Parses a {@code WIDTHxHEIGHT} string into a two-element array.
   *
   * <p>Only the first two {@code x}-separated parts are used, so any value the
   * previous inline parsing accepted is still accepted. Surrounding whitespace
   * is tolerated.
   *
   * @param raw the raw property value; may be {@code null}
   * @return {@code {width, height}}, or {@code {0, 0}} when {@code raw} cannot be parsed
   */
  private static int[] parseSize(String raw) {
    if (raw != null) {
      String[] parts = raw.trim().split("x");
      if (parts.length >= 2) {
        try {
          int parsedWidth = Integer.parseInt(parts[0].trim());
          int parsedHeight = Integer.parseInt(parts[1].trim());
          return new int[] {parsedWidth, parsedHeight};
        } catch (NumberFormatException e) {
          // Not numeric: fall through to the shared warning and default below.
        }
      }
    }
    LOG.warn("Invalid {} value '{}': expected WIDTHxHEIGHT, falling back to 0x0", CONF_PROPERTY, raw);
    return new int[] {0, 0};
  }

  /**
   * Returns the configuration last passed to {@link #setConf(Configuration)}.
   *
   * @return the stored configuration, or {@code null} if none has been set
   */
  public Configuration getConf() {
    return this.conf;
  }
}