// http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
// ----------------------------------------------------------------------
// diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
// new file mode 100644
// index 0000000..daf96e0
// --- /dev/null
// +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
// @@ -0,0 +1,278 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

// JDK imports
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Nutch imports
import org.apache.nutch.util.NutchConfiguration;

/**
 * A reader to load the information stored in the
 * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
 *
 * @author mattmann
 * @version 1.0
 */
class ParsePluginsReader {

  /* our log stream */
  public static final Logger LOG = LoggerFactory
      .getLogger(ParsePluginsReader.class);

  /** The property name of the parse-plugins location */
  private static final String PP_FILE_PROP = "parse.plugin.file";

  /** the parse-plugins file */
  private String fParsePluginsFile = null;

  /**
   * Constructs a new ParsePluginsReader
   */
  public ParsePluginsReader() {
  }

  /**
   * Reads the <code>parse-plugins.xml</code> file and returns the
   * {@link ParsePluginList} defined by it.
   *
   * <p>
   * If a file location was set through {@link #setFParsePluginsFile(String)}
   * it is opened as a URL; otherwise the resource named by the
   * <code>parse.plugin.file</code> property is loaded from the configuration.
   * The underlying stream is always closed before this method returns.
   * </p>
   *
   * @param conf
   *          configuration used to locate the file when no explicit location
   *          has been set.
   * @return the {@link ParsePluginList} described by the file; an empty list
   *         if the explicitly set URL cannot be opened; <code>null</code> if
   *         the XML cannot be parsed.
   */
  public ParsePluginList parse(Configuration conf) {

    ParsePluginList pList = new ParsePluginList();

    Document document = null;
    InputStream ppInputStream = null;
    try {
      if (fParsePluginsFile != null) {
        try {
          ppInputStream = new URL(fParsePluginsFile).openStream();
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Unable to load parse plugins file from URL " + "["
                + fParsePluginsFile + "]. Reason is [" + e + "]");
          }
          return pList;
        }
      } else {
        ppInputStream = conf.getConfResourceAsInputStream(conf
            .get(PP_FILE_PROP));
      }

      try {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder parser = factory.newDocumentBuilder();
        document = parser.parse(new InputSource(ppInputStream));
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          // report the location that was actually used, not always the
          // (possibly null) explicit file name
          String source = fParsePluginsFile != null ? fParsePluginsFile : conf
              .get(PP_FILE_PROP);
          LOG.warn("Unable to parse [" + source + "]. Reason is [" + e + "]");
        }
        return null;
      }
    } finally {
      // the DOM is fully built (or parsing failed): release the stream
      closeQuietly(ppInputStream);
    }

    Element parsePlugins = document.getDocumentElement();

    // build up the alias hash map
    Map<String, String> aliases = getAliases(parsePlugins);
    // And store it on the parse plugin list
    pList.setAliases(aliases);

    // get all the mime type nodes
    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");

    // iterate through the mime types
    for (int i = 0; i < mimeTypes.getLength(); i++) {
      Element mimeType = (Element) mimeTypes.item(i);
      String mimeTypeStr = mimeType.getAttribute("name");

      // for each mimeType, get the plugin list
      NodeList pluginList = mimeType.getElementsByTagName("plugin");

      // plugins are added in the order read, unless they carry an
      // order="" attribute, in which case they are inserted at that
      // (1-based) position
      if (pluginList != null && pluginList.getLength() > 0) {
        List<String> plugList = new ArrayList<String>(pluginList.getLength());

        for (int j = 0; j < pluginList.getLength(); j++) {
          Element plugin = (Element) pluginList.item(j);
          String pluginId = plugin.getAttribute("id");
          String extId = aliases.get(pluginId);
          if (extId == null) {
            // Assume an extension id is directly specified
            extId = pluginId;
          }
          addPlugin(plugList, extId, plugin.getAttribute("order"), mimeTypeStr);
        }

        // now add the plugin list and map it to this mimeType
        pList.setPluginList(mimeTypeStr, plugList);

      } else if (LOG.isWarnEnabled()) {
        LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
            + mimeTypeStr + ", continuing parse");
      }
    }
    return pList;
  }

  /**
   * Adds an extension id to a plugin list, honouring an optional 1-based
   * <code>order</code> attribute. A missing or non-numeric order appends the
   * id; an order that does not denote a valid insertion position in the list
   * built so far is reported and the id is appended instead of failing with
   * an {@link IndexOutOfBoundsException}.
   */
  private static void addPlugin(List<String> plugList, String extId,
      String orderStr, String mimeTypeStr) {
    int order = -1;
    try {
      order = Integer.parseInt(orderStr);
    } catch (NumberFormatException ignored) {
      // no usable order attribute: keep the order in which plugins are read
    }
    if (order == -1) {
      plugList.add(extId);
      return;
    }
    int index = order - 1;
    if (index < 0 || index > plugList.size()) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Ignoring invalid order [" + orderStr + "] of plugin ["
            + extId + "] for mime type: " + mimeTypeStr);
      }
      plugList.add(extId);
    } else {
      plugList.add(index, extId);
    }
  }

  /** Closes the stream if it is open; a failure to close is only logged. */
  private static void closeQuietly(InputStream in) {
    if (in == null) {
      return;
    }
    try {
      in.close();
    } catch (IOException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Unable to close parse plugins stream. Reason is [" + e + "]");
      }
    }
  }

  /**
   * Tests parsing of the parse-plugins.xml file. An alternative name for the
   * file can be specified via the <code>--file</code> option, although the file
   * must be located in the <code>$NUTCH_HOME/conf</code> directory.
   *
   * @param args
   *          Currently only the --file argument to specify an alternative name
   *          for the parse-plugins.xml file is supported.
   */
  public static void main(String[] args) throws Exception {
    String parsePluginFile = null;
    String usage = "ParsePluginsReader [--file <parse plugin file location>]";

    if ((args.length != 0 && args.length != 2)
        || (args.length == 2 && !"--file".equals(args[0]))) {
      System.err.println(usage);
      System.exit(1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("--file")) {
        parsePluginFile = args[++i];
      }
    }

    ParsePluginsReader reader = new ParsePluginsReader();

    if (parsePluginFile != null) {
      reader.setFParsePluginsFile(parsePluginFile);
    }

    ParsePluginList prefs = reader.parse(NutchConfiguration.create());
    if (prefs == null) {
      // parse() signals an unparsable file with null; the reason was logged
      System.err.println("Unable to parse the parse plugins file");
      System.exit(1);
      return;
    }

    for (String mimeType : prefs.getSupportedMimeTypes()) {

      System.out.println("MIMETYPE: " + mimeType);
      List<String> plugList = prefs.getPluginList(mimeType);

      System.out.println("EXTENSION IDs:");

      for (String j : plugList) {
        System.out.println(j);
      }
    }

  }

  /**
   * @return Returns the fParsePluginsFile.
   */
  public String getFParsePluginsFile() {
    return fParsePluginsFile;
  }

  /**
   * @param parsePluginsFile
   *          The fParsePluginsFile to set.
   */
  public void setFParsePluginsFile(String parsePluginsFile) {
    fParsePluginsFile = parsePluginsFile;
  }

  /**
   * Collects the <code>alias</code> entries below the first
   * <code>aliases</code> element into a map from plugin id (the
   * <code>name</code> attribute) to extension id.
   *
   * @param parsePluginsRoot
   *          root element of the parse-plugins document.
   * @return the alias map, empty if no aliases are defined.
   */
  private Map<String, String> getAliases(Element parsePluginsRoot) {

    Map<String, String> aliases = new HashMap<String, String>();
    NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");

    if (aliasRoot == null || aliasRoot.getLength() == 0) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("No aliases defined in parse-plugins.xml!");
      }
      return aliases;
    }

    if (aliasRoot.getLength() > 1) {
      // log a warning, but try and continue processing
      if (LOG.isWarnEnabled()) {
        LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
      }
    }

    Element aliasRootElem = (Element) aliasRoot.item(0);
    NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");

    if (aliasElements != null && aliasElements.getLength() > 0) {
      for (int i = 0; i < aliasElements.getLength(); i++) {
        Element aliasElem = (Element) aliasElements.item(i);
        String parsePluginId = aliasElem.getAttribute("name");
        String extensionId = aliasElem.getAttribute("extension-id");
        if (LOG.isTraceEnabled()) {
          LOG.trace("Found alias: plugin-id: " + parsePluginId
              + ", extension-id: " + extensionId);
        }
        // getAttribute() yields "" (not null) for a missing attribute, so an
        // incomplete alias must be detected by emptiness
        if (parsePluginId != null && parsePluginId.length() > 0
            && extensionId != null && extensionId.length() > 0) {
          aliases.put(parsePluginId, extensionId);
        }
      }
    }
    return aliases;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java new file mode 100644 index 0000000..92d8871 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.io.Text; + +/** + * A utility class that stores result of a parse. Internally a ParseResult + * stores <{@link Text}, {@link Parse}> pairs. + * <p> + * Parsers may return multiple results, which correspond to parts or other + * associated documents related to the original URL. 
+ * </p> + * <p> + * There will be usually one parse result that corresponds directly to the + * original URL, and possibly many (or none) results that correspond to derived + * URLs (or sub-URLs). + */ +public class ParseResult implements Iterable<Map.Entry<Text, Parse>> { + private Map<Text, Parse> parseMap; + private String originalUrl; + + public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class); + + /** + * Create a container for parse results. + * + * @param originalUrl + * the original url from which all parse results have been obtained. + */ + public ParseResult(String originalUrl) { + parseMap = new HashMap<Text, Parse>(); + this.originalUrl = originalUrl; + } + + /** + * Convenience method for obtaining {@link ParseResult} from a single + * <code>Parse</code> output. + * + * @param url + * canonical url. + * @param parse + * single parse output. + * @return result containing the single parse output. + */ + public static ParseResult createParseResult(String url, Parse parse) { + ParseResult parseResult = new ParseResult(url); + parseResult.put(new Text(url), new ParseText(parse.getText()), + parse.getData()); + return parseResult; + } + + /** + * Checks whether the result is empty. + * + * @return + */ + public boolean isEmpty() { + return parseMap.isEmpty(); + } + + /** + * Return the number of parse outputs (both successful and failed) + */ + public int size() { + return parseMap.size(); + } + + /** + * Retrieve a single parse output. + * + * @param key + * sub-url under which the parse output is stored. + * @return parse output corresponding to this sub-url, or null. + */ + public Parse get(String key) { + return get(new Text(key)); + } + + /** + * Retrieve a single parse output. + * + * @param key + * sub-url under which the parse output is stored. + * @return parse output corresponding to this sub-url, or null. + */ + public Parse get(Text key) { + return parseMap.get(key); + } + + /** + * Store a result of parsing. 
+ * + * @param key + * URL or sub-url of this parse result + * @param text + * plain text result + * @param data + * corresponding parse metadata of this result + */ + public void put(Text key, ParseText text, ParseData data) { + put(key.toString(), text, data); + } + + /** + * Store a result of parsing. + * + * @param key + * URL or sub-url of this parse result + * @param text + * plain text result + * @param data + * corresponding parse metadata of this result + */ + public void put(String key, ParseText text, ParseData data) { + parseMap.put(new Text(key), + new ParseImpl(text, data, key.equals(originalUrl))); + } + + /** + * Iterate over all entries in the <url, Parse> map. + */ + public Iterator<Entry<Text, Parse>> iterator() { + return parseMap.entrySet().iterator(); + } + + /** + * Remove all results where status is not successful (as determined by + * </code>ParseStatus#isSuccess()</code>). Note that effects of this operation + * cannot be reversed. + */ + public void filter() { + for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { + Entry<Text, Parse> entry = i.next(); + if (!entry.getValue().getData().getStatus().isSuccess()) { + LOG.warn(entry.getKey() + " is not parsed successfully, filtering"); + i.remove(); + } + } + + } + + /** + * A convenience method which returns true only if all parses are successful. + * Parse success is determined by <code>ParseStatus#isSuccess()</code>. 
+ */ + public boolean isSuccess() { + for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { + Entry<Text, Parse> entry = i.next(); + if (!entry.getValue().getData().getStatus().isSuccess()) { + return false; + } + } + return true; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java new file mode 100644 index 0000000..b008bed --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java @@ -0,0 +1,309 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.segment.SegmentChecker; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.hadoop.conf.*; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.*; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.*; +import org.apache.hadoop.fs.Path; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.Map.Entry; + +/* Parse content in a segment. */ +public class ParseSegment extends NutchTool implements Tool, + Mapper<WritableComparable<?>, Content, Text, ParseImpl>, + Reducer<Text, Writable, Text, Writable> { + + public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class); + + public static final String SKIP_TRUNCATED = "parser.skip.truncated"; + + private ScoringFilters scfilters; + + private ParseUtil parseUtil; + + private boolean skipTruncated; + + public ParseSegment() { + this(null); + } + + public ParseSegment(Configuration conf) { + super(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.scfilters = new ScoringFilters(job); + skipTruncated = job.getBoolean(SKIP_TRUNCATED, true); + } + + public void close() { + } + + private Text newKey = new Text(); + + public void map(WritableComparable<?> key, Content content, + OutputCollector<Text, ParseImpl> output, Reporter reporter) + throws IOException { + // convert on the fly from old UTF8 keys + if (key instanceof Text) { + newKey.set(key.toString()); + key = newKey; + } + + int status = 
Integer.parseInt(content.getMetadata().get( + Nutch.FETCH_STATUS_KEY)); + if (status != CrawlDatum.STATUS_FETCH_SUCCESS) { + // content not fetched successfully, skip document + LOG.debug("Skipping " + key + " as content is not fetched successfully"); + return; + } + + if (skipTruncated && isTruncated(content)) { + return; + } + + long start = System.currentTimeMillis(); + ParseResult parseResult = null; + try { + if (parseUtil == null) + parseUtil = new ParseUtil(getConf()); + parseResult = parseUtil.parse(content); + } catch (Exception e) { + LOG.warn("Error parsing: " + key + ": " + + StringUtils.stringifyException(e)); + return; + } + + for (Entry<Text, Parse> entry : parseResult) { + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); + + reporter.incrCounter("ParserStatus", + ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); + + if (!parseStatus.isSuccess()) { + LOG.warn("Error parsing: " + key + ": " + parseStatus); + parse = parseStatus.getEmptyParse(getConf()); + } + + // pass segment name to parse data + parse.getData().getContentMeta() + .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY)); + + // compute the new signature + byte[] signature = SignatureFactory.getSignature(getConf()).calculate( + content, parse); + parse.getData().getContentMeta() + .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); + + try { + scfilters.passScoreAfterParsing(url, content, parse); + } catch (ScoringFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error passing score: " + url + ": " + e.getMessage()); + } + } + + long end = System.currentTimeMillis(); + LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url); + + output.collect( + url, + new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse + .isCanonical())); + } + } + + /** + * Checks if the page's content is truncated. 
+ * + * @param content + * @return If the page is truncated <code>true</code>. When it is not, or when + * it could be determined, <code>false</code>. + */ + public static boolean isTruncated(Content content) { + byte[] contentBytes = content.getContent(); + if (contentBytes == null) + return false; + Metadata metadata = content.getMetadata(); + if (metadata == null) + return false; + + String lengthStr = metadata.get(Response.CONTENT_LENGTH); + if (lengthStr != null) + lengthStr = lengthStr.trim(); + if (StringUtil.isEmpty(lengthStr)) { + return false; + } + int inHeaderSize; + String url = content.getUrl(); + try { + inHeaderSize = Integer.parseInt(lengthStr); + } catch (NumberFormatException e) { + LOG.warn("Wrong contentlength format for " + url, e); + return false; + } + int actualSize = contentBytes.length; + if (inHeaderSize > actualSize) { + LOG.info(url + " skipped. Content of size " + inHeaderSize + + " was truncated to " + actualSize); + return true; + } + if (LOG.isDebugEnabled()) { + LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + + inHeaderSize); + } + return false; + } + + public void reduce(Text key, Iterator<Writable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + output.collect(key, values.next()); // collect first value + } + + public void parse(Path segment) throws IOException { + if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) { + LOG.warn("Segment: " + segment + + " already parsed!! 
Skipped parsing this segment!!"); // NUTCH-1854 + return; + } + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + long start = System.currentTimeMillis(); + if (LOG.isInfoEnabled()) { + LOG.info("ParseSegment: starting at " + sdf.format(start)); + LOG.info("ParseSegment: segment: " + segment); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("parse " + segment); + + FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME)); + job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ParseSegment.class); + job.setReducerClass(ParseSegment.class); + + FileOutputFormat.setOutputPath(job, segment); + job.setOutputFormat(ParseOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(ParseImpl.class); + + JobClient.runJob(job); + long end = System.currentTimeMillis(); + LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), + args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + Path segment; + + String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + if (args.length > 1) { + for (int i = 1; i < args.length; i++) { + String param = args[i]; + + if ("-nofilter".equalsIgnoreCase(param)) { + getConf().setBoolean("parse.filter.urls", false); + } else if ("-nonormalize".equalsIgnoreCase(param)) { + getConf().setBoolean("parse.normalize.urls", false); + } + } + } + + segment = new Path(args[0]); + parse(segment); + return 0; + } + + /* + * Used for Nutch REST service + */ + public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception { + + Map<String, Object> results = new 
HashMap<String, Object>(); + Path segment; + if(args.containsKey(Nutch.ARG_SEGMENT)) { + Object seg = args.get(Nutch.ARG_SEGMENT); + if(seg instanceof Path) { + segment = (Path) seg; + } + else { + segment = new Path(seg.toString()); + } + } + else { + String segment_dir = crawlId+"/segments"; + File segmentsDir = new File(segment_dir); + File[] segmentsList = segmentsDir.listFiles(); + Arrays.sort(segmentsList, new Comparator<File>(){ + @Override + public int compare(File f1, File f2) { + if(f1.lastModified()>f2.lastModified()) + return -1; + else + return 0; + } + }); + segment = new Path(segmentsList[0].getPath()); + } + + if (args.containsKey("nofilter")) { + getConf().setBoolean("parse.filter.urls", false); + } + if (args.containsKey("nonormalize")) { + getConf().setBoolean("parse.normalize.urls", false); + } + parse(segment); + results.put(Nutch.VAL_RESULT, Integer.toString(0)); + return results; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java new file mode 100644 index 0000000..b9d5959 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Created on Apr 28, 2005 + * Author: Andrzej Bialecki <[email protected]> + * + */ +package org.apache.nutch.parse; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.VersionMismatchException; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.metadata.Metadata; + +/** + * @author Andrzej Bialecki <[email protected]> + */ +public class ParseStatus implements Writable { + + private final static byte VERSION = 2; + + // Primary status codes: + + /** Parsing was not performed. */ + public static final byte NOTPARSED = 0; + /** Parsing succeeded. */ + public static final byte SUCCESS = 1; + /** General failure. There may be a more specific error message in arguments. */ + public static final byte FAILED = 2; + + public static final String[] majorCodes = { "notparsed", "success", "failed" }; + + // Secondary success codes go here: + + /** + * Parsed content contains a directive to redirect to another URL. The target + * URL can be retrieved from the arguments. + */ + public static final short SUCCESS_REDIRECT = 100; + + // Secondary failure codes go here: + + /** + * Parsing failed. An Exception occured (which may be retrieved from the + * arguments). + */ + public static final short FAILED_EXCEPTION = 200; + /** + * Parsing failed. Content was truncated, but the parser cannot handle + * incomplete content. 
+ */ + public static final short FAILED_TRUNCATED = 202; + /** + * Parsing failed. Invalid format - the content may be corrupted or of wrong + * type. + */ + public static final short FAILED_INVALID_FORMAT = 203; + /** + * Parsing failed. Other related parts of the content are needed to complete + * parsing. The list of URLs to missing parts may be provided in arguments. + * The Fetcher may decide to fetch these parts at once, then put them into + * Content.metadata, and supply them for re-parsing. + */ + public static final short FAILED_MISSING_PARTS = 204; + /** + * Parsing failed. There was no content to be parsed - probably caused by + * errors at protocol stage. + */ + public static final short FAILED_MISSING_CONTENT = 205; + + public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED); + public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS); + public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED); + + private byte majorCode = 0; + private short minorCode = 0; + private String[] args = null; + + public byte getVersion() { + return VERSION; + } + + public ParseStatus() { + + } + + public ParseStatus(int majorCode, int minorCode, String[] args) { + this.args = args; + this.majorCode = (byte) majorCode; + this.minorCode = (short) minorCode; + } + + public ParseStatus(int majorCode) { + this(majorCode, 0, (String[]) null); + } + + public ParseStatus(int majorCode, String[] args) { + this(majorCode, 0, args); + } + + public ParseStatus(int majorCode, int minorCode) { + this(majorCode, minorCode, (String[]) null); + } + + /** Simplified constructor for passing just a text message. */ + public ParseStatus(int majorCode, int minorCode, String message) { + this(majorCode, minorCode, new String[] { message }); + } + + /** Simplified constructor for passing just a text message. 
*/ + public ParseStatus(int majorCode, String message) { + this(majorCode, 0, new String[] { message }); + } + + public ParseStatus(Throwable t) { + this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() }); + } + + public static ParseStatus read(DataInput in) throws IOException { + ParseStatus res = new ParseStatus(); + res.readFields(in); + return res; + } + + public void readFields(DataInput in) throws IOException { + byte version = in.readByte(); + switch (version) { + case 1: + majorCode = in.readByte(); + minorCode = in.readShort(); + args = WritableUtils.readCompressedStringArray(in); + break; + case 2: + majorCode = in.readByte(); + minorCode = in.readShort(); + args = WritableUtils.readStringArray(in); + break; + default: + throw new VersionMismatchException(VERSION, version); + } + } + + public void write(DataOutput out) throws IOException { + out.writeByte(VERSION); + out.writeByte(majorCode); + out.writeShort(minorCode); + if (args == null) { + out.writeInt(-1); + } else { + WritableUtils.writeStringArray(out, args); + } + } + + /** + * A convenience method. Returns true if majorCode is SUCCESS, false + * otherwise. + */ + + public boolean isSuccess() { + return majorCode == SUCCESS; + } + + /** + * A convenience method. Return a String representation of the first argument, + * or null. + */ + public String getMessage() { + if (args != null && args.length > 0 && args[0] != null) + return args[0]; + return null; + } + + public String[] getArgs() { + return args; + } + + public int getMajorCode() { + return majorCode; + } + + public int getMinorCode() { + return minorCode; + } + + /** + * A convenience method. Creates an empty Parse instance, which returns this + * status. + */ + public Parse getEmptyParse(Configuration conf) { + return new EmptyParseImpl(this, conf); + } + + /** + * A convenience method. Creates an empty ParseResult, which contains this + * status. 
+ */ + public ParseResult getEmptyParseResult(String url, Configuration conf) { + return ParseResult.createParseResult(url, getEmptyParse(conf)); + } + + public String toString() { + StringBuffer res = new StringBuffer(); + String name = null; + if (majorCode >= 0 && majorCode < majorCodes.length) + name = majorCodes[majorCode]; + else + name = "UNKNOWN!"; + res.append(name + "(" + majorCode + "," + minorCode + ")"); + if (args != null) { + if (args.length == 1) { + res.append(": " + String.valueOf(args[0])); + } else { + for (int i = 0; i < args.length; i++) { + if (args[i] != null) + res.append(", args[" + i + "]=" + String.valueOf(args[i])); + } + } + } + return res.toString(); + } + + public void setArgs(String[] args) { + this.args = args; + } + + public void setMessage(String msg) { + if (args == null || args.length == 0) { + args = new String[1]; + } + args[0] = msg; + } + + public void setMajorCode(byte majorCode) { + this.majorCode = majorCode; + } + + public void setMinorCode(short minorCode) { + this.minorCode = minorCode; + } + + public boolean equals(Object o) { + if (o == null) + return false; + if (!(o instanceof ParseStatus)) + return false; + boolean res = true; + ParseStatus other = (ParseStatus) o; + res = res && (this.majorCode == other.majorCode) + && (this.minorCode == other.minorCode); + if (!res) + return res; + if (this.args == null) { + if (other.args == null) + return true; + else + return false; + } else { + if (other.args == null) + return false; + if (other.args.length != this.args.length) + return false; + for (int i = 0; i < this.args.length; i++) { + if (!this.args[i].equals(other.args[i])) + return false; + } + } + return true; + } + + private static class EmptyParseImpl implements Parse { + + private ParseData data = null; + + public EmptyParseImpl(ParseStatus status, Configuration conf) { + data = new ParseData(status, "", new Outlink[0], new Metadata(), + new Metadata()); + } + + public ParseData getData() { + return data; + } + 
+ public String getText() { + return ""; + } + + public boolean isCanonical() { + return true; + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java new file mode 100644 index 0000000..13416cf --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import java.io.*; +import org.apache.hadoop.io.*; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.commons.cli.Options; +import org.apache.nutch.util.NutchConfiguration; + +/* The text conversion of page's content, stored using gzip compression. 
+ * @see Parse#getText() + */ +public final class ParseText implements Writable { + public static final String DIR_NAME = "parse_text"; + + private final static byte VERSION = 2; + + public ParseText() { + } + + private String text; + + public ParseText(String text) { + this.text = text; + } + + public void readFields(DataInput in) throws IOException { + byte version = in.readByte(); + switch (version) { + case 1: + text = WritableUtils.readCompressedString(in); + break; + case VERSION: + text = Text.readString(in); + break; + default: + throw new VersionMismatchException(VERSION, version); + } + } + + public final void write(DataOutput out) throws IOException { + out.write(VERSION); + Text.writeString(out, text); + } + + public final static ParseText read(DataInput in) throws IOException { + ParseText parseText = new ParseText(); + parseText.readFields(in); + return parseText; + } + + // + // Accessor methods + // + public String getText() { + return text; + } + + public boolean equals(Object o) { + if (!(o instanceof ParseText)) + return false; + ParseText other = (ParseText) o; + return this.text.equals(other.text); + } + + public String toString() { + return text; + } + + public static void main(String argv[]) throws Exception { + String usage = "ParseText (-local | -dfs <namenode:port>) recno segment"; + + if (argv.length < 3) { + System.out.println("usage:" + usage); + return; + } + Options opts = new Options(); + Configuration conf = NutchConfiguration.create(); + + GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv); + + String[] remainingArgs = parser.getRemainingArgs(); + + FileSystem fs = FileSystem.get(conf); + try { + int recno = Integer.parseInt(remainingArgs[0]); + String segment = remainingArgs[1]; + String filename = new Path(segment, ParseText.DIR_NAME).toString(); + + ParseText parseText = new ParseText(); + ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf); + + parseTexts.get(recno, parseText); + 
System.out.println("Retrieved " + recno + " from file " + filename); + System.out.println(parseText); + parseTexts.close(); + } finally { + fs.close(); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java new file mode 100644 index 0000000..39024dc --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java @@ -0,0 +1,181 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse; + +// Commons Logging imports + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.protocol.Content; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * A Utility class containing methods to simply perform parsing utilities such + * as iterating through a preferred list of {@link Parser}s to obtain + * {@link Parse} objects. + * + * @author mattmann + * @author Jérôme Charron + * @author Sébastien Le Callonnec + */ +public class ParseUtil { + + /* our log stream */ + public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class); + private ParserFactory parserFactory; + /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/ + private int maxParseTime = 30; + private ExecutorService executorService; + + /** + * + * @param conf + */ + public ParseUtil(Configuration conf) { + this.parserFactory = new ParserFactory(conf); + maxParseTime = conf.getInt("parser.timeout", 30); + executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder() + .setNameFormat("parse-%d").setDaemon(true).build()); + } + + /** + * Performs a parse by iterating through a List of preferred {@link Parser}s + * until a successful parse is performed and a {@link Parse} object is + * returned. If the parse is unsuccessful, a message is logged to the + * <code>WARNING</code> level, and an empty parse is returned. + * + * @param content + * The content to try and parse. + * @return <key, {@link Parse}> pairs. + * @throws ParseException + * If no suitable parser is found to perform the parse. 
+ */ + public ParseResult parse(Content content) throws ParseException { + Parser[] parsers = null; + + try { + parsers = this.parserFactory.getParsers(content.getContentType(), + content.getUrl() != null ? content.getUrl() : ""); + } catch (ParserNotFound e) { + if (LOG.isWarnEnabled()) { + LOG.warn("No suitable parser found when trying to parse content " + + content.getUrl() + " of type " + content.getContentType()); + } + throw new ParseException(e.getMessage()); + } + + ParseResult parseResult = null; + for (int i = 0; i < parsers.length; i++) { + if (LOG.isDebugEnabled()) { + LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + + "]"); + } + if (maxParseTime != -1) + parseResult = runParser(parsers[i], content); + else + parseResult = parsers[i].getParse(content); + + if (parseResult != null && !parseResult.isEmpty()) + return parseResult; + } + + if (LOG.isWarnEnabled()) { + LOG.warn("Unable to successfully parse content " + content.getUrl() + + " of type " + content.getContentType()); + } + return new ParseStatus(new ParseException( + "Unable to successfully parse content")).getEmptyParseResult( + content.getUrl(), null); + } + + /** + * Method parses a {@link Content} object using the {@link Parser} specified + * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a + * suitable {@link Parser} is not found, then a <code>WARNING</code> level + * message is logged, and a ParseException is thrown. If the parse is + * uncessful for any other reason, then a <code>WARNING</code> level message + * is logged, and a <code>ParseStatus.getEmptyParse()</code> is returned. + * + * @param extId + * The extension implementation ID of the {@link Parser} to use to + * parse the specified content. + * @param content + * The content to parse. + * + * @return <key, {@link Parse}> pairs if the parse is successful, + * otherwise, a single <key, + * <code>ParseStatus.getEmptyParse()</code>> pair. 
+ * + * @throws ParseException + * If there is no suitable {@link Parser} found to perform the + * parse. + */ + public ParseResult parseByExtensionId(String extId, Content content) + throws ParseException { + Parser p = null; + + try { + p = this.parserFactory.getParserById(extId); + } catch (ParserNotFound e) { + if (LOG.isWarnEnabled()) { + LOG.warn("No suitable parser found when trying to parse content " + + content.getUrl() + " of type " + content.getContentType()); + } + throw new ParseException(e.getMessage()); + } + + ParseResult parseResult = null; + if (maxParseTime != -1) + parseResult = runParser(p, content); + else + parseResult = p.getParse(content); + if (parseResult != null && !parseResult.isEmpty()) { + return parseResult; + } else { + if (LOG.isWarnEnabled()) { + LOG.warn("Unable to successfully parse content " + content.getUrl() + + " of type " + content.getContentType()); + } + return new ParseStatus(new ParseException( + "Unable to successfully parse content")).getEmptyParseResult( + content.getUrl(), null); + } + } + + private ParseResult runParser(Parser p, Content content) { + ParseCallable pc = new ParseCallable(p, content); + Future<ParseResult> task = executorService.submit(pc); + ParseResult res = null; + try { + res = task.get(maxParseTime, TimeUnit.SECONDS); + } catch (Exception e) { + LOG.warn("Error parsing " + content.getUrl() + " with " + p, e); + task.cancel(true); + } finally { + pc = null; + } + return res; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java new file mode 100644 index 0000000..d101453 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// Hadoop imports +import org.apache.hadoop.conf.Configurable; + +// Nutch imports +import org.apache.nutch.plugin.Pluggable; +import org.apache.nutch.protocol.Content; + +/** + * A parser for content generated by a + * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is + * implemented by extensions. Nutch's core contains no page parsing code. + */ +public interface Parser extends Pluggable, Configurable { + /** The name of the extension point. */ + public final static String X_POINT_ID = Parser.class.getName(); + + /** + * <p> + * This method parses the given content and returns a map of <key, + * parse> pairs. {@link Parse} instances will be persisted under the given + * key. + * </p> + * <p> + * Note: Meta-redirects should be followed only when they are coming from the + * original URL. That is: <br> + * Assume fetcher is in parsing mode and is currently processing + * foo.bar.com/redirect.html. If this url contains a meta redirect to another + * url, fetcher should only follow the redirect if the map contains an entry + * of the form <"foo.bar.com/redirect.html", {@link Parse} with a + * {@link ParseStatus} indicating the redirect>. 
+ * </p> + * + * @param c + * Content to be parsed + * @return a map containing <key, parse> pairs + * @since NUTCH-443 + */ + ParseResult getParse(Content c); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java new file mode 100644 index 0000000..7e5b146 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java @@ -0,0 +1,270 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.util.StringUtil; + +/** + * Parser checker, useful for testing parser. It also accurately reports + * possible fetching and parsing failures and presents protocol status signals + * to aid debugging. The tool enables us to retrieve the following data from any + * url: + * <ol> + * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} + * type.</li> + * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and + * is used to remove duplicates during the dedup procedure. 
It is calculated + * using {@link org.apache.nutch.crawl.MD5Signature} or + * {@link org.apache.nutch.crawl.TextProfileSignature}.</li> + * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> + * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> + * <li><tt>Title</tt>: of the URL</li> + * <li><tt>Outlinks</tt>: associated with the URL</li> + * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>, + * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, + * <i>Cache-Control</>, etc.</li> + * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>, + * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li> + * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing + * on <code>content.length</code> configuration.</li> + * </ol> + * + * @author John Xing + */ + +public class ParserChecker implements Tool { + + public static final Logger LOG = LoggerFactory.getLogger(ParserChecker.class); + private Configuration conf; + + public ParserChecker() { + } + + public int run(String[] args) throws Exception { + boolean dumpText = false; + boolean force = false; + String contentType = null; + String url = null; + + String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url"; + + if (args.length == 0) { + LOG.error(usage); + return (-1); + } + + // used to simulate the metadata propagated from injection + HashMap<String, String> metadata = new HashMap<String, String>(); + + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-forceAs")) { + force = true; + contentType = args[++i]; + } else if (args[i].equals("-dumpText")) { + dumpText = true; + } else if (args[i].equals("-md")) { + String k = null, v = null; + String nextOne = args[++i]; + int firstEquals = nextOne.indexOf("="); + if (firstEquals != -1) { + k = nextOne.substring(0, firstEquals); + v = nextOne.substring(firstEquals + 1); + } else + k = nextOne; + 
metadata.put(k, v); + } else if (i != args.length - 1) { + LOG.error(usage); + System.exit(-1); + } else { + url = URLUtil.toASCII(args[i]); + } + } + + if (LOG.isInfoEnabled()) { + LOG.info("fetching: " + url); + } + + CrawlDatum cd = new CrawlDatum(); + + Iterator<String> iter = metadata.keySet().iterator(); + while (iter.hasNext()) { + String key = iter.next(); + String value = metadata.get(key); + if (value == null) + value = ""; + cd.getMetaData().put(new Text(key), new Text(value)); + } + + ProtocolFactory factory = new ProtocolFactory(conf); + Protocol protocol = factory.getProtocol(url); + Text turl = new Text(url); + ProtocolOutput output = protocol.getProtocolOutput(turl, cd); + + // If the configuration permits, handle redirects until we either run + // out of allowed redirects or we stop getting redirect statuses. + int maxRedirects = conf.getInt("http.redirect.max", 0); + int numRedirects = 0; + while (output.getStatus().isRedirect() && numRedirects < maxRedirects) { + String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]); + LOG.info("Handling redirect to " + newURL); + + protocol = factory.getProtocol(newURL); + turl = new Text(newURL); + output = protocol.getProtocolOutput(turl, cd); + + numRedirects++; + } + + if (!output.getStatus().isSuccess()) { + System.err.println("Fetch failed with protocol status: " + + output.getStatus()); + + if (output.getStatus().isRedirect()) { + System.err.println("Redirect(s) not handled due to configuration."); + System.err.println("Max Redirects to handle per config: " + maxRedirects); + System.err.println("Number of Redirects handled: " + numRedirects); + } + return (-1); + } + + Content content = output.getContent(); + + if (content == null) { + LOG.error("No content for " + url); + return (-1); + } + + if (force) { + content.setContentType(contentType); + } else { + contentType = content.getContentType(); + } + + if (contentType == null) { + LOG.error("Failed to determine content type!"); + return (-1); 
+ } + + if (ParseSegment.isTruncated(content)) { + LOG.warn("Content is truncated, parse may fail!"); + } + + ScoringFilters scfilters = new ScoringFilters(conf); + // call the scoring filters + try { + scfilters.passScoreBeforeParsing(turl, cd, content); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + + ")"); + LOG.warn(StringUtils.stringifyException(e)); + } + } + + ParseResult parseResult = new ParseUtil(conf).parse(content); + + if (parseResult == null) { + LOG.error("Parsing content failed!"); + return (-1); + } + + // Calculate the signature + byte[] signature = SignatureFactory.getSignature(getConf()).calculate( + content, parseResult.get(new Text(url))); + + if (LOG.isInfoEnabled()) { + LOG.info("parsing: " + url); + LOG.info("contentType: " + contentType); + LOG.info("signature: " + StringUtil.toHexString(signature)); + } + + Parse parse = parseResult.get(turl); + if (parse == null) { + LOG.error("Failed to get parse from parse result"); + LOG.error("Available parses in parse result (by URL key):"); + for (Map.Entry<Text, Parse> entry : parseResult) { + LOG.error(" " + entry.getKey()); + } + LOG.error("Parse result does not contain a parse for URL to be checked:"); + LOG.error(" " + turl); + return -1; + } + + // call the scoring filters + try { + scfilters.passScoreAfterParsing(turl, content, parse); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + + ")"); + LOG.warn(StringUtils.stringifyException(e)); + } + } + + for (Map.Entry<Text, Parse> entry : parseResult) { + parse = entry.getValue(); + LOG.info("---------\nUrl\n---------------\n"); + System.out.print(entry.getKey()); + LOG.info("\n---------\nParseData\n---------\n"); + System.out.print(parse.getData().toString()); + if (dumpText) { + LOG.info("---------\nParseText\n---------\n"); + System.out.print(parse.getText()); + } + } + + return 0; + 
} + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration c) { + conf = c; + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(), + args); + System.exit(res); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java new file mode 100644 index 0000000..0982de4 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java @@ -0,0 +1,428 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse; + +// JDK imports +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Vector; + +// Commons Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.util.ObjectCache; + +/** Creates and caches {@link Parser} plugins. */ +public final class ParserFactory { + + public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class); + + /** Wildcard for default plugins. */ + public static final String DEFAULT_PLUGIN = "*"; + + /** Empty extension list for caching purposes. */ + private final List<Extension> EMPTY_EXTENSION_LIST = Collections + .<Extension> emptyList(); + + private Configuration conf; + private ExtensionPoint extensionPoint; + private ParsePluginList parsePluginList; + + public ParserFactory(Configuration conf) { + this.conf = conf; + ObjectCache objectCache = ObjectCache.get(conf); + this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( + Parser.X_POINT_ID); + this.parsePluginList = (ParsePluginList) objectCache + .getObject(ParsePluginList.class.getName()); + + if (this.parsePluginList == null) { + this.parsePluginList = new ParsePluginsReader().parse(conf); + objectCache.setObject(ParsePluginList.class.getName(), + this.parsePluginList); + } + + if (this.extensionPoint == null) { + throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found."); + } + if (this.parsePluginList == null) { + throw new RuntimeException( + "Parse Plugins preferences could not be loaded."); + } + } + + /** + * Function returns an array of {@link Parser}s for a given 
content type. + * + * The function consults the internal list of parse plugins for the + * ParserFactory to determine the list of pluginIds, then gets the appropriate + * extension points to instantiate as {@link Parser}s. + * + * @param contentType + * The contentType to return the <code>Array</code> of {@link Parser} + * s for. + * @param url + * The url for the content that may allow us to get the type from the + * file suffix. + * @return An <code>Array</code> of {@link Parser}s for the given contentType. + * If there were plugins mapped to a contentType via the + * <code>parse-plugins.xml</code> file, but never enabled via the + * <code>plugin.includes</code> Nutch conf, then those plugins won't + * be part of this array, i.e., they will be skipped. So, if the + * ordered list of parsing plugins for <code>text/plain</code> was + * <code>[parse-text,parse-html, + * parse-rtf]</code>, and only <code>parse-html</code> and + * <code>parse-rtf</code> were enabled via + * <code>plugin.includes</code>, then this ordered Array would consist + * of two {@link Parser} interfaces, + * <code>[parse-html, parse-rtf]</code>. + */ + public Parser[] getParsers(String contentType, String url) + throws ParserNotFound { + + List<Parser> parsers = null; + List<Extension> parserExts = null; + + ObjectCache objectCache = ObjectCache.get(conf); + + // TODO once the MimeTypes is available + // parsers = getExtensions(MimeUtils.map(contentType)); + // if (parsers != null) { + // return parsers; + // } + // Last Chance: Guess content-type from file url... 
+ // parsers = getExtensions(MimeUtils.getMimeType(url)); + + parserExts = getExtensions(contentType); + if (parserExts == null) { + throw new ParserNotFound(url, contentType); + } + + parsers = new Vector<Parser>(parserExts.size()); + for (Iterator<Extension> i = parserExts.iterator(); i.hasNext();) { + Extension ext = i.next(); + Parser p = null; + try { + // check to see if we've cached this parser instance yet + p = (Parser) objectCache.getObject(ext.getId()); + if (p == null) { + // go ahead and instantiate it and then cache it + p = (Parser) ext.getExtensionInstance(); + objectCache.setObject(ext.getId(), p); + } + parsers.add(p); + } catch (PluginRuntimeException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("ParserFactory:PluginRuntimeException when " + + "initializing parser plugin " + + ext.getDescriptor().getPluginId() + " instance in getParsers " + + "function: attempting to continue instantiating parsers"); + } + } + } + return parsers.toArray(new Parser[] {}); + } + + /** + * Function returns a {@link Parser} instance with the specified + * <code>extId</code>, representing its extension ID. If the Parser instance + * isn't found, then the function throws a <code>ParserNotFound</code> + * exception. If the function is able to find the {@link Parser} in the + * internal <code>PARSER_CACHE</code> then it will return the already + * instantiated Parser. Otherwise, if it has to instantiate the Parser itself + * , then this function will cache that Parser in the internal + * <code>PARSER_CACHE</code>. + * + * @param id + * The string extension ID (e.g., + * "org.apache.nutch.parse.rss.RSSParser", + * "org.apache.nutch.parse.rtf.RTFParseFactory") of the + * {@link Parser} implementation to return. + * @return A {@link Parser} implementation specified by the parameter + * <code>id</code>. 
+ * @throws ParserNotFound + * If the Parser is not found (i.e., registered with the extension + * point), or if the there a {@link PluginRuntimeException} + * instantiating the {@link Parser}. + */ + public Parser getParserById(String id) throws ParserNotFound { + + Extension[] extensions = this.extensionPoint.getExtensions(); + Extension parserExt = null; + + ObjectCache objectCache = ObjectCache.get(conf); + + if (id != null) { + parserExt = getExtension(extensions, id); + } + if (parserExt == null) { + parserExt = getExtensionFromAlias(extensions, id); + } + + if (parserExt == null) { + throw new ParserNotFound("No Parser Found for id [" + id + "]"); + } + + // first check the cache + if (objectCache.getObject(parserExt.getId()) != null) { + return (Parser) objectCache.getObject(parserExt.getId()); + + // if not found in cache, instantiate the Parser + } else { + try { + Parser p = (Parser) parserExt.getExtensionInstance(); + objectCache.setObject(parserExt.getId(), p); + return p; + } catch (PluginRuntimeException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Canno initialize parser " + + parserExt.getDescriptor().getPluginId() + " (cause: " + + e.toString()); + } + throw new ParserNotFound("Cannot init parser for id [" + id + "]"); + } + } + } + + /** + * Finds the best-suited parse plugin for a given contentType. + * + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return a list of extensions to be used for this contentType. If none, + * returns <code>null</code>. + */ + @SuppressWarnings("unchecked") + protected List<Extension> getExtensions(String contentType) { + + ObjectCache objectCache = ObjectCache.get(conf); + // First of all, tries to clean the content-type + String type = null; + type = MimeUtil.cleanMimeType(contentType); + + List<Extension> extensions = (List<Extension>) objectCache.getObject(type); + + // Just compare the reference: + // if this is the empty list, we know we will find no extension. 
+ if (extensions == EMPTY_EXTENSION_LIST) { + return null; + } + + if (extensions == null) { + extensions = findExtensions(type); + if (extensions != null) { + objectCache.setObject(type, extensions); + } else { + // Put the empty extension list into cache + // to remember we don't know any related extension. + objectCache.setObject(type, EMPTY_EXTENSION_LIST); + } + } + return extensions; + } + + /** + * searches a list of suitable parse plugins for the given contentType. + * <p> + * It first looks for a preferred plugin defined in the parse-plugin file. If + * none is found, it returns a list of default plugins. + * + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return List - List of extensions to be used for this contentType. If none, + * returns null. + */ + private List<Extension> findExtensions(String contentType) { + + Extension[] extensions = this.extensionPoint.getExtensions(); + + // Look for a preferred plugin. + List<String> parsePluginList = this.parsePluginList + .getPluginList(contentType); + List<Extension> extensionList = matchExtensions(parsePluginList, + extensions, contentType); + if (extensionList != null) { + return extensionList; + } + + // If none found, look for a default plugin. + parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN); + return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN); + } + + /** + * Tries to find a suitable parser for the given contentType. + * <ol> + * <li>It checks if a parser which accepts the contentType can be found in the + * <code>plugins</code> list;</li> + * <li>If this list is empty, it tries to find amongst the loaded extensions + * whether some of them might suit and warns the user.</li> + * </ol> + * + * @param plugins + * List of candidate plugins. + * @param extensions + * Array of loaded extensions. + * @param contentType + * Content-Type for which we seek a parse plugin. 
+ * @return List - List of extensions to be used for this contentType. If none, + * returns null. + */ + private List<Extension> matchExtensions(List<String> plugins, + Extension[] extensions, String contentType) { + + List<Extension> extList = new ArrayList<Extension>(); + if (plugins != null) { + + for (String parsePluginId : plugins) { + + Extension ext = getExtension(extensions, parsePluginId, contentType); + // the extension returned may be null + // that means that it was not enabled in the plugin.includes + // nutch conf property, but it was mapped in the + // parse-plugins.xml + // file. + // OR it was enabled in plugin.includes, but the plugin's plugin.xml + // file does not claim that the plugin supports the specified mimeType + // in either case, LOG the appropriate error message to WARN level + + if (ext == null) { + // try to get it just by its pluginId + ext = getExtension(extensions, parsePluginId); + + if (LOG.isWarnEnabled()) { + if (ext != null) { + // plugin was enabled via plugin.includes + // its plugin.xml just doesn't claim to support that + // particular mimeType + LOG.warn("ParserFactory:Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but " + "its plugin.xml " + + "file does not claim to support contentType: " + + contentType); + } else { + // plugin wasn't enabled via plugin.includes + LOG.warn("ParserFactory: Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but not enabled via " + + "plugin.includes in nutch-default.xml"); + } + } + } + + if (ext != null) { + // add it to the list + extList.add(ext); + } + } + + } else { + // okay, there were no list of plugins defined for + // this mimeType, however, there may be plugins registered + // via the plugin.includes nutch conf property that claim + // via their plugin.xml file to support this contentType + // so, iterate through the list of extensions and if you find + // any extensions where 
this is the case, throw a + // NotMappedParserException + + for (int i = 0; i < extensions.length; i++) { + if ("*".equals(extensions[i].getAttribute("contentType"))) { + extList.add(0, extensions[i]); + } else if (extensions[i].getAttribute("contentType") != null + && contentType.matches(escapeContentType(extensions[i] + .getAttribute("contentType")))) { + extList.add(extensions[i]); + } + } + + if (extList.size() > 0) { + if (LOG.isInfoEnabled()) { + StringBuffer extensionsIDs = new StringBuffer("["); + boolean isFirst = true; + for (Extension ext : extList) { + if (!isFirst) + extensionsIDs.append(" - "); + else + isFirst = false; + extensionsIDs.append(ext.getId()); + } + extensionsIDs.append("]"); + LOG.info("The parsing plugins: " + extensionsIDs.toString() + + " are enabled via the plugin.includes system " + + "property, and all claim to support the content type " + + contentType + ", but they are not mapped to it in the " + + "parse-plugins.xml file"); + } + } else if (LOG.isDebugEnabled()) { + LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + + "contentType " + contentType); + } + } + + return (extList.size() > 0) ? extList : null; + } + + private String escapeContentType(String contentType) { + // Escapes contentType in order to use as a regex + // (and keep backwards compatibility). + // This enables to accept multiple types for a single parser. + return contentType.replace("+", "\\+").replace(".", "\\."); + } + + private boolean match(Extension extension, String id, String type) { + return ((id.equals(extension.getId())) && (extension.getAttribute( + "contentType").equals("*") + || type + .matches(escapeContentType(extension.getAttribute("contentType"))) || type + .equals(DEFAULT_PLUGIN))); + } + + /** Get an extension from its id and supported content-type. 
*/ + private Extension getExtension(Extension[] list, String id, String type) { + for (int i = 0; i < list.length; i++) { + if (match(list[i], id, type)) { + return list[i]; + } + } + return null; + } + + private Extension getExtension(Extension[] list, String id) { + for (int i = 0; i < list.length; i++) { + if (id.equals(list[i].getId())) { + return list[i]; + } + } + return null; + } + + private Extension getExtensionFromAlias(Extension[] list, String id) { + return getExtension(list, parsePluginList.getAliases().get(id)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java new file mode 100644 index 0000000..2857efa --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse; + +public class ParserNotFound extends ParseException { + + private static final long serialVersionUID = 23993993939L; + private String url; + private String contentType; + + public ParserNotFound(String message) { + super(message); + } + + public ParserNotFound(String url, String contentType) { + this(url, contentType, "parser not found for contentType=" + contentType + + " url=" + url); + } + + public ParserNotFound(String url, String contentType, String message) { + super(message); + this.url = url; + this.contentType = contentType; + } + + public String getUrl() { + return url; + } + + public String getContentType() { + return contentType; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java new file mode 100644 index 0000000..40bd3e2 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes. + */ +package org.apache.nutch.parse; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java new file mode 100644 index 0000000..f50c11a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +/** + * <code>CircularDependencyException</code> will be thrown if a circular + * dependency is detected. 
+ * + * @author Jérôme Charron + */ +public class CircularDependencyException extends Exception { + + private static final long serialVersionUID = 1L; + + public CircularDependencyException(Throwable cause) { + super(cause); + } + + public CircularDependencyException(String message) { + super(message); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java new file mode 100644 index 0000000..b0ee0af --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.util.HashMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configurable; + +/** + * An <code>Extension</code> is a kind of listener descriptor that will be + * installed on a concrete <code>ExtensionPoint</code> that acts as kind of + * Publisher. 
+ */ +public class Extension { + private PluginDescriptor fDescriptor; + private String fId; + private String fTargetPoint; + private String fClazz; + private HashMap<String, String> fAttributes; + private Configuration conf; + + /** + * @param pDescriptor + * a plugin descriptor + * @param pExtensionPoint + * an extension porin + * @param pId + * an unique id of the plugin + */ + public Extension(PluginDescriptor pDescriptor, String pExtensionPoint, + String pId, String pExtensionClass, Configuration conf, + PluginRepository pluginRepository) { + fAttributes = new HashMap<String, String>(); + setDescriptor(pDescriptor); + setExtensionPoint(pExtensionPoint); + setId(pId); + setClazz(pExtensionClass); + this.conf = conf; + } + + /** + * @param point + */ + private void setExtensionPoint(String point) { + fTargetPoint = point; + } + + /** + * Returns a attribute value, that is setuped in the manifest file and is + * definied by the extension point xml schema. + * + * @param pKey + * a key + * @return String a value + */ + public String getAttribute(String pKey) { + return fAttributes.get(pKey); + } + + /** + * Returns the full class name of the extension point implementation + * + * @return String + */ + public String getClazz() { + return fClazz; + } + + /** + * Return the unique id of the extension. + * + * @return String + */ + public String getId() { + return fId; + } + + /** + * Adds a attribute and is only used until model creation at plugin system + * start up. + * + * @param pKey + * a key + * @param pValue + * a value + */ + public void addAttribute(String pKey, String pValue) { + fAttributes.put(pKey, pValue); + } + + /** + * Sets the Class that implement the concret extension and is only used until + * model creation at system start up. 
+ * + * @param extensionClazz + * The extensionClasname to set + */ + public void setClazz(String extensionClazz) { + fClazz = extensionClazz; + } + + /** + * Sets the unique extension Id and is only used until model creation at + * system start up. + * + * @param extensionID + * The extensionID to set + */ + public void setId(String extensionID) { + fId = extensionID; + } + + /** + * Returns the Id of the extension point, that is implemented by this + * extension. + */ + public String getTargetPoint() { + return fTargetPoint; + } + + /** + * Return an instance of the extension implementatio. Before we create a + * extension instance we startup the plugin if it is not already done. The + * plugin instance and the extension instance use the same + * <code>PluginClassLoader</code>. Each Plugin use its own classloader. The + * PluginClassLoader knows only own <i>Plugin runtime libraries </i> setuped + * in the plugin manifest file and exported libraries of the depenedend + * plugins. + * + * @return Object An instance of the extension implementation + */ + public Object getExtensionInstance() throws PluginRuntimeException { + // Must synchronize here to make sure creation and initialization + // of a plugin instance and it extension instance are done by + // one and only one thread. + // The same is in PluginRepository.getPluginInstance(). + // Suggested by Stefan Groschupf <[email protected]> + synchronized (getId()) { + try { + PluginRepository pluginRepository = PluginRepository.get(conf); + Class<?> extensionClazz = pluginRepository.getCachedClass(fDescriptor, + getClazz()); + // lazy loading of Plugin in case there is no instance of the plugin + // already. 
+ pluginRepository.getPluginInstance(getDescriptor()); + Object object = extensionClazz.newInstance(); + if (object instanceof Configurable) { + ((Configurable) object).setConf(this.conf); + } + return object; + } catch (ClassNotFoundException e) { + throw new PluginRuntimeException(e); + } catch (InstantiationException e) { + throw new PluginRuntimeException(e); + } catch (IllegalAccessException e) { + throw new PluginRuntimeException(e); + } + } + } + + /** + * return the plugin descriptor. + * + * @return PluginDescriptor + */ + public PluginDescriptor getDescriptor() { + return fDescriptor; + } + + /** + * Sets the plugin descriptor and is only used until model creation at system + * start up. + * + * @param pDescriptor + */ + public void setDescriptor(PluginDescriptor pDescriptor) { + fDescriptor = pDescriptor; + } +}
