Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Jan 29 05:38:59 2015 @@ -47,7 +47,8 @@ import org.apache.hadoop.util.Progressab /* Parse content in a segment. */ public class ParseOutputFormat implements OutputFormat<Text, Parse> { - private static final Logger LOG = LoggerFactory.getLogger(ParseOutputFormat.class); + private static final Logger LOG = LoggerFactory + .getLogger(ParseOutputFormat.class); private URLFilters filters; private URLNormalizers normalizers; @@ -56,16 +57,16 @@ public class ParseOutputFormat implement private static class SimpleEntry implements Entry<Text, CrawlDatum> { private Text key; private CrawlDatum value; - + public SimpleEntry(Text key, CrawlDatum value) { this.key = key; this.value = value; } - + public Text getKey() { return key; } - + public CrawlDatum getValue() { return value; } @@ -77,93 +78,92 @@ public class ParseOutputFormat implement } public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException { - Path out = FileOutputFormat.getOutputPath(job); - if ((out == null) && (job.getNumReduceTasks() != 0)) { - throw new InvalidJobConfException( - "Output directory not set in JobConf."); - } - if (fs == null) { - fs = out.getFileSystem(job); - } - if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME))) - throw new IOException("Segment already parsed!"); + Path out = FileOutputFormat.getOutputPath(job); + if ((out == null) && (job.getNumReduceTasks() != 0)) { + throw new InvalidJobConfException("Output directory not set in JobConf."); + } + if (fs == null) { + fs = out.getFileSystem(job); + } + if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME))) + throw new IOException("Segment already parsed!"); } public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, - String name, Progressable progress) throws IOException { + String name, Progressable progress) throws IOException { - if(job.getBoolean("parse.filter.urls", true)) { + if (job.getBoolean("parse.filter.urls", true)) { filters = new URLFilters(job); } - if(job.getBoolean("parse.normalize.urls", true)) { + if (job.getBoolean("parse.normalize.urls", true)) { normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); } this.scfilters = new ScoringFilters(job); final int interval = job.getInt("db.fetch.interval.default", 2592000); - final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); + final boolean ignoreExternalLinks = job.getBoolean( + "db.ignore.external.links", false); int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100); final boolean isParsing = job.getBoolean("fetcher.parse", true); final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE - : maxOutlinksPerPage; - final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job); + : maxOutlinksPerPage; + final CompressionType compType = SequenceFileOutputFormat + .getOutputCompressionType(job); Path out = FileOutputFormat.getOutputPath(job); - + Path text = new Path(new Path(out, ParseText.DIR_NAME), name); Path data = new Path(new Path(out, ParseData.DIR_NAME), name); Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name); - - final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb","").split(" *, *"); - - final MapFile.Writer textOut = - new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class, - CompressionType.RECORD, progress); - - final MapFile.Writer dataOut = - new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class, - compType, progress); - - final SequenceFile.Writer crawlOut = - SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class, - compType, progress); - + + final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "") + .split(" *, *"); + + final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), + Text.class, ParseText.class, CompressionType.RECORD, progress); + + final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), + Text.class, ParseData.class, compType, progress); + + final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, + crawl, Text.class, CrawlDatum.class, compType, progress); + return new RecordWriter<Text, Parse>() { + public void write(Text key, Parse parse) throws IOException { - public void write(Text key, Parse parse) - throws IOException { - - String fromUrl = key.toString(); - String fromHost = null; - textOut.append(key, new ParseText(parse.getText())); - - ParseData parseData = parse.getData(); - // recover the signature prepared by Fetcher or ParseSegment - String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY); - if (sig != null) { - byte[] signature = StringUtil.fromHexString(sig); - if (signature != null) { - // append a CrawlDatum with a signature - CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); - d.setSignature(signature); - crawlOut.append(key, d); - } + String fromUrl = key.toString(); + String fromHost = null; + textOut.append(key, new ParseText(parse.getText())); + + ParseData parseData = parse.getData(); + // recover the signature prepared by Fetcher or ParseSegment + String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY); + if (sig != null) { + byte[] signature = StringUtil.fromHexString(sig); + if (signature != null) { + // append a CrawlDatum with a signature + CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); + d.setSignature(signature); + crawlOut.append(key, d); } - + } + // see if the parse metadata contain things that we'd like // to pass to the metadata of the crawlDB entry CrawlDatum parseMDCrawlDatum = null; for (String mdname : parseMDtoCrawlDB) { String mdvalue = parse.getData().getParseMeta().get(mdname); if (mdvalue != null) { - if (parseMDCrawlDatum == null) parseMDCrawlDatum = new CrawlDatum( - CrawlDatum.STATUS_PARSE_META, 0); + if (parseMDCrawlDatum == null) + parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, + 0); parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue)); } } - if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum); + if (parseMDCrawlDatum != null) + crawlOut.append(key, parseMDCrawlDatum); if (ignoreExternalLinks) { // need to determine fromHost (once for all outlinks) @@ -198,91 +198,96 @@ public class ParseOutputFormat implement } } - // collect outlinks for subsequent db update - Outlink[] links = parseData.getOutlinks(); - int outlinksToStore = Math.min(maxOutlinks, links.length); - - int validCount = 0; - CrawlDatum adjust = null; - List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore); - List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); - for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { - String toUrl = links[i].getToUrl(); - - // Only normalize and filter if fetcher.parse = false - if (!isParsing) { - toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks, filters, normalizers); - if (toUrl == null) { - continue; - } - } - - CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); - Text targetUrl = new Text(toUrl); - - // see if the outlink has any metadata attached - // and if so pass that to the crawldatum so that - // the initial score or distribution can use that - MapWritable outlinkMD = links[i].getMetadata(); - if (outlinkMD!=null){ - target.getMetaData().putAll(outlinkMD); - } - - try { - scfilters.initialScore(targetUrl, target); - } catch (ScoringFilterException e) { - LOG.warn("Cannot filter init score for url " + key + - ", using default: " + e.getMessage()); - target.setScore(0.0f); + // collect outlinks for subsequent db update + Outlink[] links = parseData.getOutlinks(); + int outlinksToStore = Math.min(maxOutlinks, links.length); + + int validCount = 0; + CrawlDatum adjust = null; + List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>( + outlinksToStore); + List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); + for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { + String toUrl = links[i].getToUrl(); + + // Only normalize and filter if fetcher.parse = false + if (!isParsing) { + toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, + ignoreExternalLinks, filters, normalizers); + if (toUrl == null) { + continue; } + } - targets.add(new SimpleEntry(targetUrl, target)); + CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); + Text targetUrl = new Text(toUrl); - // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174) - links[i].setUrl(toUrl); - outlinkList.add(links[i]); - validCount++; + // see if the outlink has any metadata attached + // and if so pass that to the crawldatum so that + // the initial score or distribution can use that + MapWritable outlinkMD = links[i].getMetadata(); + if (outlinkMD != null) { + target.getMetaData().putAll(outlinkMD); } try { - // compute score contributions and adjustment to the original score - adjust = scfilters.distributeScoreToOutlinks(key, parseData, - targets, null, links.length); + scfilters.initialScore(targetUrl, target); } catch (ScoringFilterException e) { - LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage()); + LOG.warn("Cannot filter init score for url " + key + + ", using default: " + e.getMessage()); + target.setScore(0.0f); } - for (Entry<Text, CrawlDatum> target : targets) { - crawlOut.append(target.getKey(), target.getValue()); - } - if (adjust != null) crawlOut.append(key, adjust); - Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]); - parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), - filteredLinks, parseData.getContentMeta(), - parseData.getParseMeta()); - dataOut.append(key, parseData); - if (!parse.isCanonical()) { - CrawlDatum datum = new CrawlDatum(); - datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); - String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY); - try { - datum.setFetchTime(Long.parseLong(timeString)); - } catch (Exception e) { - LOG.warn("Can't read fetch time for: " + key); - datum.setFetchTime(System.currentTimeMillis()); - } - crawlOut.append(key, datum); - } + targets.add(new SimpleEntry(targetUrl, target)); + + // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174) + links[i].setUrl(toUrl); + outlinkList.add(links[i]); + validCount++; + } + + try { + // compute score contributions and adjustment to the original score + adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, + null, links.length); + } catch (ScoringFilterException e) { + LOG.warn("Cannot distribute score from " + key + ": " + + e.getMessage()); } - - public void close(Reporter reporter) throws IOException { - textOut.close(); - dataOut.close(); - crawlOut.close(); + for (Entry<Text, CrawlDatum> target : targets) { + crawlOut.append(target.getKey(), target.getValue()); } - - }; - + if (adjust != null) + crawlOut.append(key, adjust); + + Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList + .size()]); + parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), + filteredLinks, parseData.getContentMeta(), parseData.getParseMeta()); + dataOut.append(key, parseData); + if (!parse.isCanonical()) { + CrawlDatum datum = new CrawlDatum(); + datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); + String timeString = parse.getData().getContentMeta() + .get(Nutch.FETCH_TIME_KEY); + try { + datum.setFetchTime(Long.parseLong(timeString)); + } catch (Exception e) { + LOG.warn("Can't read fetch time for: " + key); + datum.setFetchTime(System.currentTimeMillis()); + } + crawlOut.append(key, datum); + } + } + + public void close(Reporter reporter) throws IOException { + textOut.close(); + dataOut.close(); + crawlOut.close(); + } + + }; + } public static String filterNormalize(String fromUrl, String toUrl, @@ -311,12 +316,12 @@ public class ParseOutputFormat implement } } try { - if(normalizers != null) { - toUrl = normalizers.normalize(toUrl, - urlNormalizerScope); // normalize the url + if (normalizers != null) { + toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize + // the url } if (filters != null) { - toUrl = filters.filter(toUrl); // filter the url + toUrl = filters.filter(toUrl); // filter the url } if (toUrl == null) { return null;
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Thu Jan 29 05:38:59 2015 @@ -22,25 +22,23 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - /** * This class represents a natural ordering for which parsing plugin should get * called for a particular mimeType. It provides methods to store the * parse-plugins.xml data, and methods to retreive the name of the appropriate * parsing plugin for a contentType. - * + * * @author mattmann * @version 1.0 */ class ParsePluginList { - + /* a map to link mimeType to an ordered list of parsing plugins */ private Map<String, List<String>> fMimeTypeToPluginMap = null; - + /* A list of aliases */ private Map<String, String> aliases = null; - - + /** * Constructs a new ParsePluginList */ @@ -48,7 +46,7 @@ class ParsePluginList { fMimeTypeToPluginMap = new HashMap<String, List<String>>(); aliases = new HashMap<String, String>(); } - + List<String> getPluginList(String mimeType) { return fMimeTypeToPluginMap.get(mimeType); } @@ -56,18 +54,18 @@ class ParsePluginList { void setAliases(Map<String, String> aliases) { this.aliases = aliases; } - + Map<String, String> getAliases() { return aliases; } - + void setPluginList(String mimeType, List<String> l) { fMimeTypeToPluginMap.put(mimeType, l); } - + List<String> getSupportedMimeTypes() { - return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray( - new String[] {})); + return Arrays + .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {})); } - + } Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Thu Jan 29 05:38:59 2015 @@ -42,50 +42,50 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.util.NutchConfiguration; - /** * A reader to load the information stored in the * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file. - * + * * @author mattmann * @version 1.0 */ class ParsePluginsReader { - + /* our log stream */ - public static final Logger LOG = LoggerFactory.getLogger(ParsePluginsReader.class); - + public static final Logger LOG = LoggerFactory + .getLogger(ParsePluginsReader.class); + /** The property name of the parse-plugins location */ private static final String PP_FILE_PROP = "parse.plugin.file"; /** the parse-plugins file */ private String fParsePluginsFile = null; - /** * Constructs a new ParsePluginsReader */ - public ParsePluginsReader() { } - + public ParsePluginsReader() { + } + /** * Reads the <code>parse-plugins.xml</code> file and returns the * {@link #ParsePluginList} defined by it. - * + * * @return A {@link #ParsePluginList} specified by the * <code>parse-plugins.xml</code> file. * @throws Exception - * If any parsing error occurs. + * If any parsing error occurs. */ public ParsePluginList parse(Configuration conf) { - + ParsePluginList pList = new ParsePluginList(); - + // open up the XML file DocumentBuilderFactory factory = null; DocumentBuilder parser = null; Document document = null; InputSource inputSource = null; - + InputStream ppInputStream = null; if (fParsePluginsFile != null) { URL parsePluginUrl = null; @@ -94,56 +94,55 @@ class ParsePluginsReader { ppInputStream = parsePluginUrl.openStream(); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to load parse plugins file from URL " + - "[" + fParsePluginsFile + "]. Reason is [" + e + "]"); + LOG.warn("Unable to load parse plugins file from URL " + "[" + + fParsePluginsFile + "]. Reason is [" + e + "]"); } return pList; } } else { - ppInputStream = conf.getConfResourceAsInputStream( - conf.get(PP_FILE_PROP)); + ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP)); } - + inputSource = new InputSource(ppInputStream); - + try { factory = DocumentBuilderFactory.newInstance(); parser = factory.newDocumentBuilder(); document = parser.parse(inputSource); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + - "Reason is [" + e + "]"); + LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is [" + + e + "]"); } return null; } - + Element parsePlugins = document.getDocumentElement(); - + // build up the alias hash map Map<String, String> aliases = getAliases(parsePlugins); // And store it on the parse plugin list pList.setAliases(aliases); - + // get all the mime type nodes NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType"); - + // iterate through the mime types for (int i = 0; i < mimeTypes.getLength(); i++) { Element mimeType = (Element) mimeTypes.item(i); String mimeTypeStr = mimeType.getAttribute("name"); - + // for each mimeType, get the plugin list NodeList pluginList = mimeType.getElementsByTagName("plugin"); - + // iterate through the plugins, add them in order read // OR if they have a special order="" attribute, then hold those in // a separate list, and then insert them into the final list at the // order specified if (pluginList != null && pluginList.getLength() > 0) { List<String> plugList = new ArrayList<String>(pluginList.getLength()); - - for (int j = 0; j<pluginList.getLength(); j++) { + + for (int j = 0; j < pluginList.getLength(); j++) { Element plugin = (Element) pluginList.item(j); String pluginId = plugin.getAttribute("id"); String extId = aliases.get(pluginId); @@ -163,110 +162,110 @@ class ParsePluginsReader { plugList.add(extId); } } - + // now add the plugin list and map it to this mimeType pList.setPluginList(mimeTypeStr, plugList); - + } else if (LOG.isWarnEnabled()) { LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: " - + mimeTypeStr + ", continuing parse"); + + mimeTypeStr + ", continuing parse"); } } return pList; } - + /** * Tests parsing of the parse-plugins.xml file. An alternative name for the - * file can be specified via the <code>--file</code> option, although the - * file must be located in the <code>$NUTCH_HOME/conf</code> directory. - * + * file can be specified via the <code>--file</code> option, although the file + * must be located in the <code>$NUTCH_HOME/conf</code> directory. + * * @param args - * Currently only the --file argument to specify an alternative - * name for the parse-plugins.xml file is supported. + * Currently only the --file argument to specify an alternative name + * for the parse-plugins.xml file is supported. */ public static void main(String[] args) throws Exception { String parsePluginFile = null; String usage = "ParsePluginsReader [--file <parse plugin file location>]"; - - if (( args.length != 0 && args.length != 2 ) + + if ((args.length != 0 && args.length != 2) || (args.length == 2 && !"--file".equals(args[0]))) { System.err.println(usage); System.exit(1); } - + for (int i = 0; i < args.length; i++) { if (args[i].equals("--file")) { parsePluginFile = args[++i]; } } - + ParsePluginsReader reader = new ParsePluginsReader(); - + if (parsePluginFile != null) { reader.setFParsePluginsFile(parsePluginFile); } - + ParsePluginList prefs = reader.parse(NutchConfiguration.create()); - + for (String mimeType : prefs.getSupportedMimeTypes()) { - + System.out.println("MIMETYPE: " + mimeType); List<String> plugList = prefs.getPluginList(mimeType); - + System.out.println("EXTENSION IDs:"); - + for (String j : plugList) { System.out.println(j); } } - + } - + /** * @return Returns the fParsePluginsFile. */ public String getFParsePluginsFile() { return fParsePluginsFile; } - + /** * @param parsePluginsFile - * The fParsePluginsFile to set. + * The fParsePluginsFile to set. */ public void setFParsePluginsFile(String parsePluginsFile) { fParsePluginsFile = parsePluginsFile; } - + private Map<String, String> getAliases(Element parsePluginsRoot) { Map<String, String> aliases = new HashMap<String, String>(); NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases"); - + if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) { if (LOG.isWarnEnabled()) { LOG.warn("No aliases defined in parse-plugins.xml!"); } return aliases; } - + if (aliasRoot.getLength() > 1) { // log a warning, but try and continue processing if (LOG.isWarnEnabled()) { LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml"); } } - - Element aliasRootElem = (Element)aliasRoot.item(0); + + Element aliasRootElem = (Element) aliasRoot.item(0); NodeList aliasElements = aliasRootElem.getElementsByTagName("alias"); - + if (aliasElements != null && aliasElements.getLength() > 0) { - for (int i=0; i<aliasElements.getLength(); i++) { - Element aliasElem = (Element)aliasElements.item(i); - String parsePluginId = aliasElem.getAttribute("name"); - String extensionId = aliasElem.getAttribute("extension-id"); + for (int i = 0; i < aliasElements.getLength(); i++) { + Element aliasElem = (Element) aliasElements.item(i); + String parsePluginId = aliasElem.getAttribute("name"); + String extensionId = aliasElem.getAttribute("extension-id"); if (LOG.isTraceEnabled()) { - LOG.trace("Found alias: plugin-id: " + parsePluginId + - ", extension-id: " + extensionId); + LOG.trace("Found alias: plugin-id: " + parsePluginId + + ", extension-id: " + extensionId); } if (parsePluginId != null && extensionId != null) { aliases.put(parsePluginId, extensionId); @@ -275,5 +274,5 @@ class ParsePluginsReader { } return aliases; } - + } Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Thu Jan 29 05:38:59 2015 @@ -27,94 +27,116 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.io.Text; /** - * A utility class that stores result of a parse. Internally - * a ParseResult stores <{@link Text}, {@link Parse}> pairs. - * <p>Parsers may return multiple results, which correspond to parts - * or other associated documents related to the original URL.</p> - * <p>There will be usually one parse result that corresponds directly - * to the original URL, and possibly many (or none) results that correspond - * to derived URLs (or sub-URLs). + * A utility class that stores result of a parse. Internally a ParseResult + * stores <{@link Text}, {@link Parse}> pairs. + * <p> + * Parsers may return multiple results, which correspond to parts or other + * associated documents related to the original URL. + * </p> + * <p> + * There will be usually one parse result that corresponds directly to the + * original URL, and possibly many (or none) results that correspond to derived + * URLs (or sub-URLs). */ public class ParseResult implements Iterable<Map.Entry<Text, Parse>> { private Map<Text, Parse> parseMap; private String originalUrl; - + public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class); - + /** * Create a container for parse results. - * @param originalUrl the original url from which all parse results - * have been obtained. + * + * @param originalUrl + * the original url from which all parse results have been obtained. */ public ParseResult(String originalUrl) { parseMap = new HashMap<Text, Parse>(); this.originalUrl = originalUrl; } - + /** * Convenience method for obtaining {@link ParseResult} from a single * <code>Parse</code> output. - * @param url canonical url. - * @param parse single parse output. + * + * @param url + * canonical url. + * @param parse + * single parse output. * @return result containing the single parse output. */ public static ParseResult createParseResult(String url, Parse parse) { ParseResult parseResult = new ParseResult(url); - parseResult.put(new Text(url), new ParseText(parse.getText()), parse.getData()); + parseResult.put(new Text(url), new ParseText(parse.getText()), + parse.getData()); return parseResult; } - + /** * Checks whether the result is empty. + * * @return */ public boolean isEmpty() { return parseMap.isEmpty(); } - + /** * Return the number of parse outputs (both successful and failed) */ public int size() { return parseMap.size(); } - + /** * Retrieve a single parse output. - * @param key sub-url under which the parse output is stored. + * + * @param key + * sub-url under which the parse output is stored. * @return parse output corresponding to this sub-url, or null. */ public Parse get(String key) { return get(new Text(key)); } - + /** * Retrieve a single parse output. - * @param key sub-url under which the parse output is stored. + * + * @param key + * sub-url under which the parse output is stored. * @return parse output corresponding to this sub-url, or null. */ public Parse get(Text key) { return parseMap.get(key); } - + /** * Store a result of parsing. - * @param key URL or sub-url of this parse result - * @param text plain text result - * @param data corresponding parse metadata of this result + * + * @param key + * URL or sub-url of this parse result + * @param text + * plain text result + * @param data + * corresponding parse metadata of this result */ public void put(Text key, ParseText text, ParseData data) { put(key.toString(), text, data); } - + /** * Store a result of parsing. - * @param key URL or sub-url of this parse result - * @param text plain text result - * @param data corresponding parse metadata of this result + * + * @param key + * URL or sub-url of this parse result + * @param text + * plain text result + * @param data + * corresponding parse metadata of this result */ public void put(String key, ParseText text, ParseData data) { - parseMap.put(new Text(key), new ParseImpl(text, data, key.equals(originalUrl))); + parseMap.put(new Text(key), + new ParseImpl(text, data, key.equals(originalUrl))); } /** @@ -123,21 +145,21 @@ public class ParseResult implements Iter public Iterator<Entry<Text, Parse>> iterator() { return parseMap.entrySet().iterator(); } - + /** - * Remove all results where status is not successful (as determined - * by </code>ParseStatus#isSuccess()</code>). Note that effects of this operation + * Remove all results where status is not successful (as determined by + * </code>ParseStatus#isSuccess()</code>). Note that effects of this operation * cannot be reversed. */ public void filter() { - for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { + for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { Entry<Text, Parse> entry = i.next(); if (!entry.getValue().getData().getStatus().isSuccess()) { LOG.warn(entry.getKey() + " is not parsed successfully, filtering"); i.remove(); } } - + } /** @@ -145,7 +167,7 @@ public class ParseResult implements Iter * Parse success is determined by <code>ParseStatus#isSuccess()</code>. */ public boolean isSuccess() { - for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { + for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { Entry<Text, Parse> entry = i.next(); if (!entry.getValue().getData().getStatus().isSuccess()) { return false; Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Jan 29 05:38:59 2015 @@ -46,19 +46,19 @@ public class ParseSegment extends Config Reducer<Text, Writable, Text, Writable> { public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class); - + public static final String SKIP_TRUNCATED = "parser.skip.truncated"; - + private ScoringFilters scfilters; - + private ParseUtil parseUtil; - + private boolean skipTruncated; - + public ParseSegment() { this(null); } - + public ParseSegment(Configuration conf) { super(conf); } @@ -66,41 +66,43 @@ public class ParseSegment extends Config public void configure(JobConf job) { setConf(job); this.scfilters = new ScoringFilters(job); - skipTruncated=job.getBoolean(SKIP_TRUNCATED, true); + skipTruncated = job.getBoolean(SKIP_TRUNCATED, true); + } + + public void close() { } - public void close() {} - private Text newKey = new Text(); public void map(WritableComparable<?> key, Content content, - OutputCollector<Text, ParseImpl> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ParseImpl> output, Reporter reporter) + throws IOException { // convert on the fly from old UTF8 keys if (key instanceof Text) { newKey.set(key.toString()); key = newKey; } - - int status = - Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY)); + + int status = Integer.parseInt(content.getMetadata().get( + Nutch.FETCH_STATUS_KEY)); if (status != CrawlDatum.STATUS_FETCH_SUCCESS) { // content not fetched successfully, skip document LOG.debug("Skipping " + key + " as content is not fetched successfully"); return; } - + if (skipTruncated && isTruncated(content)) { return; } ParseResult parseResult = null; try { - if (parseUtil == null) + if (parseUtil == null) parseUtil = new ParseUtil(getConf()); parseResult = parseUtil.parse(content); } catch (Exception e) { - LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); + LOG.warn("Error parsing: " + key + ": " + + StringUtils.stringifyException(e)); return; } @@ -111,7 +113,8 @@ public class ParseSegment extends Config long start = System.currentTimeMillis(); - reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); + reporter.incrCounter("ParserStatus", + ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); @@ -119,45 +122,51 @@ public class ParseSegment extends Config } // pass segment name to parse data - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - getConf().get(Nutch.SEGMENT_NAME_KEY)); + parse.getData().getContentMeta() + .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY)); // compute the new signature - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, parse); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); - + byte[] signature = SignatureFactory.getSignature(getConf()).calculate( + content, parse); + parse.getData().getContentMeta() + .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); + try { scfilters.passScoreAfterParsing(url, content, parse); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Error passing score: "+ url +": "+e.getMessage()); + LOG.warn("Error passing score: " + url + ": " + e.getMessage()); } } long end = System.currentTimeMillis(); LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url); - output.collect(url, new ParseImpl(new ParseText(parse.getText()), - parse.getData(), parse.isCanonical())); + output.collect( + url, + new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse + .isCanonical())); } } - + /** * Checks if the page's content is truncated. + * * @param content - * @return If the page is truncated <code>true</code>. When it is not, - * or when it could be determined, <code>false</code>. + * @return If the page is truncated <code>true</code>. When it is not, or when + * it could be determined, <code>false</code>. */ public static boolean isTruncated(Content content) { byte[] contentBytes = content.getContent(); - if (contentBytes == null) return false; + if (contentBytes == null) + return false; Metadata metadata = content.getMetadata(); - if (metadata == null) return false; - + if (metadata == null) + return false; + String lengthStr = metadata.get(Response.CONTENT_LENGTH); - if (lengthStr != null) lengthStr=lengthStr.trim(); + if (lengthStr != null) + lengthStr = lengthStr.trim(); if (StringUtil.isEmpty(lengthStr)) { return false; } @@ -176,14 +185,15 @@ public class ParseSegment extends Config return true; } if (LOG.isDebugEnabled()) { - LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize); + LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + + inHeaderSize); } return false; } public void reduce(Text key, Iterator<Writable> values, - OutputCollector<Text, Writable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { output.collect(key, values.next()); // collect first value } @@ -204,7 +214,7 @@ public class ParseSegment extends Config job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ParseSegment.class); job.setReducerClass(ParseSegment.class); - + FileOutputFormat.setOutputPath(job, segment); job.setOutputFormat(ParseOutputFormat.class); job.setOutputKeyClass(Text.class); @@ -212,15 +222,16 @@ public class ParseSegment extends Config JobClient.runJob(job); long end = System.currentTimeMillis(); - LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args); - System.exit(res); + int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), + args); + System.exit(res); } - + public int run(String[] args) throws Exception { Path segment; @@ -231,11 +242,11 @@ public class ParseSegment extends Config System.exit(-1); } - if(args.length > 1) { - for(int i = 1; i < args.length; i++) { + if (args.length > 1) { + for (int i = 1; i < args.length; i++) { String param = args[i]; - if("-nofilter".equalsIgnoreCase(param)) { + if ("-nofilter".equalsIgnoreCase(param)) { getConf().setBoolean("parse.filter.urls", false); } else if ("-nonormalize".equalsIgnoreCase(param)) { getConf().setBoolean("parse.normalize.urls", false); Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jan 29 05:38:59 2015 @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ /* * Created on Apr 28, 2005 * Author: Andrzej Bialecki <[email protected]> @@ -32,113 +32,119 @@ import org.apache.hadoop.conf.Configurat import org.apache.nutch.metadata.Metadata; - /** * @author Andrzej Bialecki <[email protected]> */ public class ParseStatus implements Writable { - + private final static byte VERSION = 2; - + // Primary status codes: - + /** Parsing was not performed. */ - public static final byte NOTPARSED = 0; + public static final byte NOTPARSED = 0; /** Parsing succeeded. */ - public static final byte SUCCESS = 1; + public static final byte SUCCESS = 1; /** General failure. There may be a more specific error message in arguments. */ - public static final byte FAILED = 2; - - public static final String[] majorCodes = { - "notparsed", - "success", - "failed" - }; - + public static final byte FAILED = 2; + + public static final String[] majorCodes = { "notparsed", "success", "failed" }; + // Secondary success codes go here: - - /** Parsed content contains a directive to redirect to another URL. - * The target URL can be retrieved from the arguments. + + /** + * Parsed content contains a directive to redirect to another URL. The target + * URL can be retrieved from the arguments. */ - public static final short SUCCESS_REDIRECT = 100; - + public static final short SUCCESS_REDIRECT = 100; + // Secondary failure codes go here: - - /** Parsing failed. An Exception occured (which may be retrieved from the arguments). */ - public static final short FAILED_EXCEPTION = 200; - /** Parsing failed. Content was truncated, but the parser cannot handle incomplete content. */ - public static final short FAILED_TRUNCATED = 202; - /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */ - public static final short FAILED_INVALID_FORMAT = 203; - /** Parsing failed. Other related parts of the content are needed to complete + + /** + * Parsing failed. An Exception occured (which may be retrieved from the + * arguments). + */ + public static final short FAILED_EXCEPTION = 200; + /** + * Parsing failed. Content was truncated, but the parser cannot handle + * incomplete content. + */ + public static final short FAILED_TRUNCATED = 202; + /** + * Parsing failed. Invalid format - the content may be corrupted or of wrong + * type. + */ + public static final short FAILED_INVALID_FORMAT = 203; + /** + * Parsing failed. Other related parts of the content are needed to complete * parsing. The list of URLs to missing parts may be provided in arguments. * The Fetcher may decide to fetch these parts at once, then put them into * Content.metadata, and supply them for re-parsing. */ - public static final short FAILED_MISSING_PARTS = 204; - /** Parsing failed. There was no content to be parsed - probably caused - * by errors at protocol stage. + public static final short FAILED_MISSING_PARTS = 204; + /** + * Parsing failed. There was no content to be parsed - probably caused by + * errors at protocol stage. */ - public static final short FAILED_MISSING_CONTENT = 205; - + public static final short FAILED_MISSING_CONTENT = 205; public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED); public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS); public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED); - + private byte majorCode = 0; private short minorCode = 0; private String[] args = null; - + public byte getVersion() { return VERSION; } public ParseStatus() { - + } - + public ParseStatus(int majorCode, int minorCode, String[] args) { this.args = args; - this.majorCode = (byte)majorCode; - this.minorCode = (short)minorCode; + this.majorCode = (byte) majorCode; + this.minorCode = (short) minorCode; } - + public ParseStatus(int majorCode) { - this(majorCode, 0, (String[])null); + this(majorCode, 0, (String[]) null); } - + public ParseStatus(int majorCode, String[] args) { this(majorCode, 0, args); } - + public ParseStatus(int majorCode, int minorCode) { - this(majorCode, minorCode, (String[])null); + this(majorCode, minorCode, (String[]) null); } - + /** Simplified constructor for passing just a text message. */ public ParseStatus(int majorCode, int minorCode, String message) { - this(majorCode, minorCode, new String[]{message}); + this(majorCode, minorCode, new String[] { message }); } - + /** Simplified constructor for passing just a text message. */ public ParseStatus(int majorCode, String message) { - this(majorCode, 0, new String[]{message}); + this(majorCode, 0, new String[] { message }); } - + public ParseStatus(Throwable t) { - this(FAILED, FAILED_EXCEPTION, new String[]{t.toString()}); + this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() }); } - + public static ParseStatus read(DataInput in) throws IOException { ParseStatus res = new ParseStatus(); res.readFields(in); return res; } - + public void readFields(DataInput in) throws IOException { byte version = in.readByte(); - switch(version) { + switch (version) { case 1: majorCode = in.readByte(); minorCode = in.readShort(); @@ -152,8 +158,8 @@ public class ParseStatus implements Writ default: throw new VersionMismatchException(VERSION, version); } - } - + } + public void write(DataOutput out) throws IOException { out.writeByte(VERSION); out.writeByte(majorCode); @@ -164,55 +170,61 @@ public class ParseStatus implements Writ WritableUtils.writeStringArray(out, args); } } - - /** A convenience method. Returns true if majorCode is SUCCESS, false + + /** + * A convenience method. Returns true if majorCode is SUCCESS, false * otherwise. */ - + public boolean isSuccess() { return majorCode == SUCCESS; } - - /** A convenience method. Return a String representation of the first - * argument, or null. + + /** + * A convenience method. Return a String representation of the first argument, + * or null. */ public String getMessage() { if (args != null && args.length > 0 && args[0] != null) return args[0]; return null; } - + public String[] getArgs() { return args; } - + public int getMajorCode() { return majorCode; } - + public int getMinorCode() { return minorCode; } - - /** A convenience method. Creates an empty Parse instance, - * which returns this status. + + /** + * A convenience method. Creates an empty Parse instance, which returns this + * status. */ public Parse getEmptyParse(Configuration conf) { return new EmptyParseImpl(this, conf); } - - /** A convenience method. Creates an empty ParseResult, - * which contains this status. + + /** + * A convenience method. Creates an empty ParseResult, which contains this + * status. */ public ParseResult getEmptyParseResult(String url, Configuration conf) { return ParseResult.createParseResult(url, getEmptyParse(conf)); } - + public String toString() { StringBuffer res = new StringBuffer(); String name = null; - if (majorCode >= 0 && majorCode < majorCodes.length) name = majorCodes[majorCode]; - else name = "UNKNOWN!"; + if (majorCode >= 0 && majorCode < majorCodes.length) + name = majorCodes[majorCode]; + else + name = "UNKNOWN!"; res.append(name + "(" + majorCode + "," + minorCode + ")"); if (args != null) { if (args.length == 1) { @@ -226,18 +238,18 @@ public class ParseStatus implements Writ } return res.toString(); } - + public void setArgs(String[] args) { this.args = args; } - + public void setMessage(String msg) { if (args == null || args.length == 0) { args = new String[1]; } args[0] = msg; } - + public void setMajorCode(byte majorCode) { this.majorCode = majorCode; } @@ -245,37 +257,45 @@ public class ParseStatus implements Writ public void setMinorCode(short minorCode) { this.minorCode = minorCode; } - + public boolean equals(Object o) { - if (o == null) return false; - if (!(o instanceof ParseStatus)) return false; + if (o == null) + return false; + if (!(o instanceof ParseStatus)) + return false; boolean res = true; - ParseStatus other = (ParseStatus)o; - res = res && (this.majorCode == other.majorCode) && - (this.minorCode == other.minorCode); - if (!res) return res; + ParseStatus other = (ParseStatus) o; + res = res && (this.majorCode == other.majorCode) + && (this.minorCode == other.minorCode); + if (!res) + return res; if (this.args == null) { - if (other.args == null) return true; - else return false; + if (other.args == null) + return true; + else + return false; } else { - if (other.args == null) return false; - if (other.args.length != this.args.length) return false; + if (other.args == null) + return false; + if (other.args.length != this.args.length) + return false; for (int i = 0; i < this.args.length; i++) { - if (!this.args[i].equals(other.args[i])) return false; + if (!this.args[i].equals(other.args[i])) + return false; } } return true; } - + private static class EmptyParseImpl implements Parse { - + private ParseData data = null; - + public EmptyParseImpl(ParseStatus status, Configuration conf) { - data = new ParseData(status, "", new Outlink[0], - new Metadata(), new Metadata()); + data = new ParseData(status, "", new Outlink[0], new Metadata(), + new Metadata()); } - + public ParseData getData() { return data; } @@ -283,10 +303,9 @@ public class ParseStatus implements Writ public String getText() { return ""; } - + public boolean isCanonical() { return true; } } } - Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java Thu Jan 29 05:38:59 2015 @@ -33,10 +33,12 @@ public final class ParseText implements private final static byte VERSION = 2; - public ParseText() {} + public ParseText() { + } + private String text; - - public ParseText(String text){ + + public ParseText(String text) { this.text = text; } @@ -68,12 +70,14 @@ public final class ParseText implements // // Accessor methods // - public String getText() { return text; } + public String getText() { + return text; + } public boolean equals(Object o) { if (!(o instanceof ParseText)) return false; - ParseText other = (ParseText)o; + ParseText other = (ParseText) o; return this.text.equals(other.text); } @@ -90,12 +94,11 @@ public final class ParseText implements } Options opts = new Options(); Configuration conf = NutchConfiguration.create(); - - GenericOptionsParser parser = - new GenericOptionsParser(conf, opts, argv); - + + GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv); + String[] remainingArgs = parser.getRemainingArgs(); - + FileSystem fs = FileSystem.get(conf); try { int recno = Integer.parseInt(remainingArgs[0]); Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Thu Jan 29 05:38:59 2015 @@ -30,127 +30,136 @@ import org.apache.nutch.protocol.Content import com.google.common.util.concurrent.ThreadFactoryBuilder; - /** * A Utility class containing methods to simply perform parsing utilities such * as iterating through a preferred list of {@link Parser}s to obtain * {@link Parse} objects. - * + * * @author mattmann * @author Jérôme Charron * @author Sébastien Le Callonnec */ public class ParseUtil { - + /* our log stream */ public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class); private ParserFactory parserFactory; /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/ private int maxParseTime = 30; private ExecutorService executorService; - + /** * * @param conf */ public ParseUtil(Configuration conf) { this.parserFactory = new ParserFactory(conf); - maxParseTime=conf.getInt("parser.timeout", 30); + maxParseTime = conf.getInt("parser.timeout", 30); executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder() - .setNameFormat("parse-%d").setDaemon(true).build()); + .setNameFormat("parse-%d").setDaemon(true).build()); } - + /** * Performs a parse by iterating through a List of preferred {@link Parser}s * until a successful parse is performed and a {@link Parse} object is * returned. If the parse is unsuccessful, a message is logged to the * <code>WARNING</code> level, and an empty parse is returned. - * - * @param content The content to try and parse. + * + * @param content + * The content to try and parse. * @return <key, {@link Parse}> pairs. - * @throws ParseException If no suitable parser is found to perform the parse. + * @throws ParseException + * If no suitable parser is found to perform the parse. */ public ParseResult parse(Content content) throws ParseException { Parser[] parsers = null; - + try { - parsers = this.parserFactory.getParsers(content.getContentType(), - content.getUrl() != null ? content.getUrl():""); + parsers = this.parserFactory.getParsers(content.getContentType(), + content.getUrl() != null ? content.getUrl() : ""); } catch (ParserNotFound e) { if (LOG.isWarnEnabled()) { - LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() + - " of type " + content.getContentType()); + LOG.warn("No suitable parser found when trying to parse content " + + content.getUrl() + " of type " + content.getContentType()); } throw new ParseException(e.getMessage()); } - + ParseResult parseResult = null; - for (int i=0; i<parsers.length; i++) { + for (int i = 0; i < parsers.length; i++) { if (LOG.isDebugEnabled()) { - LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]"); + LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + + "]"); } - if (maxParseTime!=-1) - parseResult = runParser(parsers[i], content); - else - parseResult = parsers[i].getParse(content); + if (maxParseTime != -1) + parseResult = runParser(parsers[i], content); + else + parseResult = parsers[i].getParse(content); if (parseResult != null && !parseResult.isEmpty()) return parseResult; } - - if (LOG.isWarnEnabled()) { - LOG.warn("Unable to successfully parse content " + content.getUrl() + - " of type " + content.getContentType()); - } - return new ParseStatus(new ParseException("Unable to successfully parse content")).getEmptyParseResult(content.getUrl(), null); + + if (LOG.isWarnEnabled()) { + LOG.warn("Unable to successfully parse content " + content.getUrl() + + " of type " + content.getContentType()); + } + return new ParseStatus(new ParseException( + "Unable to successfully parse content")).getEmptyParseResult( + content.getUrl(), null); } - + /** * Method parses a {@link Content} object using the {@link Parser} specified - * by the parameter <code>extId</code>, i.e., the Parser's extension ID. - * If a suitable {@link Parser} is not found, then a <code>WARNING</code> - * level message is logged, and a ParseException is thrown. If the parse is - * uncessful for any other reason, then a <code>WARNING</code> level - * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is - * returned. - * - * @param extId The extension implementation ID of the {@link Parser} to use - * to parse the specified content. - * @param content The content to parse. - * - * @return <key, {@link Parse}> pairs if the parse is successful, otherwise, - * a single <key, <code>ParseStatus.getEmptyParse()</code>> pair. - * - * @throws ParseException If there is no suitable {@link Parser} found - * to perform the parse. + * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a + * suitable {@link Parser} is not found, then a <code>WARNING</code> level + * message is logged, and a ParseException is thrown. If the parse is + * uncessful for any other reason, then a <code>WARNING</code> level message + * is logged, and a <code>ParseStatus.getEmptyParse()</code> is returned. + * + * @param extId + * The extension implementation ID of the {@link Parser} to use to + * parse the specified content. + * @param content + * The content to parse. + * + * @return <key, {@link Parse}> pairs if the parse is successful, + * otherwise, a single <key, + * <code>ParseStatus.getEmptyParse()</code>> pair. + * + * @throws ParseException + * If there is no suitable {@link Parser} found to perform the + * parse. */ public ParseResult parseByExtensionId(String extId, Content content) - throws ParseException { + throws ParseException { Parser p = null; - + try { p = this.parserFactory.getParserById(extId); } catch (ParserNotFound e) { if (LOG.isWarnEnabled()) { - LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() + - " of type " + content.getContentType()); + LOG.warn("No suitable parser found when trying to parse content " + + content.getUrl() + " of type " + content.getContentType()); } throw new ParseException(e.getMessage()); } - + ParseResult parseResult = null; - if (maxParseTime!=-1) - parseResult = runParser(p, content); - else - parseResult = p.getParse(content); + if (maxParseTime != -1) + parseResult = runParser(p, content); + else + parseResult = p.getParse(content); if (parseResult != null && !parseResult.isEmpty()) { return parseResult; } else { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to successfully parse content " + content.getUrl() + - " of type " + content.getContentType()); - } - return new ParseStatus(new ParseException("Unable to successfully parse content")).getEmptyParseResult(content.getUrl(), null); + LOG.warn("Unable to successfully parse content " + content.getUrl() + + " of type " + content.getContentType()); + } + return new ParseStatus(new ParseException( + "Unable to successfully parse content")).getEmptyParseResult( + content.getUrl(), null); } } @@ -168,5 +177,5 @@ public class ParseUtil { } return res; } - + } Modified: nutch/trunk/src/java/org/apache/nutch/parse/Parser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/Parser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/Parser.java Thu Jan 29 05:38:59 2015 @@ -24,33 +24,35 @@ import org.apache.hadoop.conf.Configurab import org.apache.nutch.plugin.Pluggable; import org.apache.nutch.protocol.Content; -/** A parser for content generated by a {@link org.apache.nutch.protocol.Protocol} - * implementation. This interface is implemented by extensions. Nutch's core - * contains no page parsing code. +/** + * A parser for content generated by a + * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is + * implemented by extensions. Nutch's core contains no page parsing code. */ public interface Parser extends Pluggable, Configurable { /** The name of the extension point. */ public final static String X_POINT_ID = Parser.class.getName(); - /** + /** * <p> - * This method parses the given content and returns a map of - * <key, parse> pairs. {@link Parse} instances will be persisted - * under the given key. + * This method parses the given content and returns a map of <key, + * parse> pairs. {@link Parse} instances will be persisted under the given + * key. * </p> * <p> - * Note: Meta-redirects should be followed only when they are coming from - * the original URL. That is: <br> + * Note: Meta-redirects should be followed only when they are coming from the + * original URL. That is: <br> * Assume fetcher is in parsing mode and is currently processing - * foo.bar.com/redirect.html. If this url contains a meta redirect - * to another url, fetcher should only follow the redirect if the map - * contains an entry of the form <"foo.bar.com/redirect.html", - * {@link Parse} with a {@link ParseStatus} indicating the redirect>. + * foo.bar.com/redirect.html. If this url contains a meta redirect to another + * url, fetcher should only follow the redirect if the map contains an entry + * of the form <"foo.bar.com/redirect.html", {@link Parse} with a + * {@link ParseStatus} indicating the redirect>. * </p> * - * @param c Content to be parsed + * @param c + * Content to be parsed * @return a map containing <key, parse> pairs * @since NUTCH-443 */ - ParseResult getParse(Content c); + ParseResult getParse(Content c); } Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jan 29 05:38:59 2015 @@ -40,28 +40,30 @@ import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.StringUtil; /** - * Parser checker, useful for testing parser. - * It also accurately reports possible fetching and - * parsing failures and presents protocol status signals to aid - * debugging. The tool enables us to retrieve the following data from - * any url: + * Parser checker, useful for testing parser. It also accurately reports + * possible fetching and parsing failures and presents protocol status signals + * to aid debugging. The tool enables us to retrieve the following data from any + * url: * <ol> - * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} type.</li> - * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and is used to remove - * duplicates during the dedup procedure. - * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or + * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} + * type.</li> + * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and + * is used to remove duplicates during the dedup procedure. It is calculated + * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.</li> * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> * <li><tt>Title</tt>: of the URL</li> * <li><tt>Outlinks</tt>: associated with the URL</li> * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>, - * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, <i>Cache-Control</>, etc.</li> + * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, + * <i>Cache-Control</>, etc.</li> * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>, * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li> - * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing on - * <code>content.length</code> configuration.</li> + * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing + * on <code>content.length</code> configuration.</li> * </ol> + * * @author John Xing */ @@ -132,12 +134,13 @@ public class ParserChecker implements To Protocol protocol = factory.getProtocol(url); Text turl = new Text(url); ProtocolOutput output = protocol.getProtocolOutput(turl, cd); - + if (!output.getStatus().isSuccess()) { - System.err.println("Fetch failed with protocol status: " + output.getStatus()); + System.err.println("Fetch failed with protocol status: " + + output.getStatus()); return (-1); } - + Content content = output.getContent(); if (content == null) { @@ -166,11 +169,12 @@ public class ParserChecker implements To scfilters.passScoreBeforeParsing(turl, cd, content); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")"); + LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + + ")"); LOG.warn(StringUtils.stringifyException(e)); } - } - + } + ParseResult parseResult = new ParseUtil(conf).parse(content); if (parseResult == null) { @@ -179,8 +183,9 @@ public class ParserChecker implements To } // Calculate the signature - byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url))); - + byte[] signature = SignatureFactory.getSignature(getConf()).calculate( + content, parseResult.get(new Text(url))); + if (LOG.isInfoEnabled()) { LOG.info("parsing: " + url); LOG.info("contentType: " + contentType); @@ -204,7 +209,8 @@ public class ParserChecker implements To scfilters.passScoreAfterParsing(turl, content, parse); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")"); + LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + + ")"); LOG.warn(StringUtils.stringifyException(e)); } }
