http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java new file mode 100644 index 0000000..78ccb27 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configurable; + +/** + * Interface used to convert URLs to normal form and optionally perform + * substitutions. + */ +public interface URLNormalizer extends Configurable { + + /** Extension point ID: the fully-qualified class name of this interface. */ + public static final String X_POINT_ID = URLNormalizer.class.getName(); + + /** + * Converts a URL string to its normal form. + * + * @param urlString + * the URL string to normalize + * @param scope + * name of the scope the normalization is performed for + * @return the normalized URL string + * @throws MalformedURLException + * declared so that an implementation can reject a malformed URL + */ + public String normalize(String urlString, String scope) + throws MalformedURLException; + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java new file mode 100644 index 0000000..d8f1c6e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +/** + * Checks one given normalizer or all normalizers. 
+ */ +public class URLNormalizerChecker { + + private Configuration conf; + + public URLNormalizerChecker(Configuration conf) { + this.conf = conf; + } + + private void checkOne(String normalizerName, String scope) throws Exception { + URLNormalizer normalizer = null; + + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.X_POINT_ID); + + if (point == null) + throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found."); + + Extension[] extensions = point.getExtensions(); + + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + normalizer = (URLNormalizer) extension.getExtensionInstance(); + if (normalizer.getClass().getName().equals(normalizerName)) { + break; + } else { + normalizer = null; + } + } + + if (normalizer == null) + throw new RuntimeException("URLNormalizer " + normalizerName + + " not found."); + + System.out.println("Checking URLNormalizer " + normalizerName); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = normalizer.normalize(line, scope); + System.out.println(out); + } + } + + private void checkAll(String scope) throws Exception { + System.out.println("Checking combination of all URLNormalizers available"); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + URLNormalizers normalizers = new URLNormalizers(conf, scope); + while ((line = in.readLine()) != null) { + String out = normalizers.normalize(line, scope); + System.out.println(out); + } + } + + public static void main(String[] args) throws Exception { + + String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]" + + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"; + + String normalizerName = null; + String scope = URLNormalizers.SCOPE_DEFAULT; + for (int i = 0; i < args.length; i++) { + 
if (args[i].equals("-normalizer")) { + normalizerName = args[++i]; + } else if (args[i].equals("-scope")) { + scope = args[++i]; + } else { + System.err.println(usage); + System.exit(-1); + } + } + + URLNormalizerChecker checker = new URLNormalizerChecker( + NutchConfiguration.create()); + if (normalizerName != null) { + checker.checkOne(normalizerName, scope); + } else { + checker.checkAll(scope); + } + + System.exit(0); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java new file mode 100644 index 0000000..7a34353 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java @@ -0,0 +1,325 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.Vector; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.apache.nutch.util.ObjectCache; + +/** + * This class uses a "chained filter" pattern to run defined normalizers. + * Different lists of normalizers may be defined for different "scopes", or + * contexts where they are used (note however that they need to be activated + * first through <tt>plugin.include</tt> property). + * + * <p> + * There is one global scope defined by default, which consists of all active + * normalizers. The order in which these normalizers are executed may be defined + * in "urlnormalizer.order" property, which lists space-separated implementation + * classes (if this property is missing normalizers will be run in random + * order). If there are more normalizers activated than explicitly named on this + * list, the remaining ones will be run in random order after the ones specified + * on the list are executed. + * </p> + * <p> + * You can define a set of contexts (or scopes) in which normalizers may be + * called. Each scope can have its own list of normalizers (defined in + * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in + * "urlnormalizer.order.<scope_name>" property). If any of these properties are + * missing, default settings are used for the global scope. 
+ * </p> + * <p> + * In case no normalizers are required for any given scope, a + * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should + * be used. + * </p> + * <p> + * Each normalizer may further select among many configurations, depending on + * the scope in which it is called, because the scope name is passed as a + * parameter to each normalizer. You can also use the same normalizer for many + * scopes. + * </p> + * <p> + * Several scopes have been defined, and various Nutch tools will attempt using + * scope-specific normalizers first (and fall back to default config if + * scope-specific configuration is missing). + * </p> + * <p> + * Normalizers may be run several times, to ensure that modifications introduced + * by normalizers at the end of the list can be further reduced by normalizers + * executed at the beginning. By default this loop is executed just once - if + * you want to ensure that all possible combinations have been applied you may + * want to run this loop up to the number of activated normalizers. This loop + * count can be configured through <tt>urlnormalizer.loop.count</tt> property. + * As soon as the url is unchanged the loop will stop and return the result. + * </p> + * + * @author Andrzej Bialecki + */ +public final class URLNormalizers { + + /** + * Default scope. If no scope properties are defined then the configuration + * for this scope will be used. + */ + public static final String SCOPE_DEFAULT = "default"; + /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */ + public static final String SCOPE_PARTITION = "partition"; + /** Scope used by {@link org.apache.nutch.crawl.Generator}. */ + public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count"; + /** + * Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing + * redirect URLs. + */ + public static final String SCOPE_FETCHER = "fetcher"; + /** Scope used when updating the CrawlDb with new URLs. 
*/ + public static final String SCOPE_CRAWLDB = "crawldb"; + /** Scope used when updating the LinkDb with new URLs. */ + public static final String SCOPE_LINKDB = "linkdb"; + /** Scope used by {@link org.apache.nutch.crawl.Injector}. */ + public static final String SCOPE_INJECT = "inject"; + /** + * Scope used when constructing new {@link org.apache.nutch.parse.Outlink} + * instances. + */ + public static final String SCOPE_OUTLINK = "outlink"; + /** Scope used when indexing URLs. */ + public static final String SCOPE_INDEXER = "indexer"; + + public static final Logger LOG = LoggerFactory + .getLogger(URLNormalizers.class); + + /* Empty extension list for caching purposes. */ + private final List<Extension> EMPTY_EXTENSION_LIST = Collections + .<Extension> emptyList(); + + private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0]; + + private Configuration conf; + + private ExtensionPoint extensionPoint; + + private URLNormalizer[] normalizers; + + private int loopCount; + + public URLNormalizers(Configuration conf, String scope) { + this.conf = conf; + this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.X_POINT_ID); + ObjectCache objectCache = ObjectCache.get(conf); + + if (this.extensionPoint == null) { + throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID + + " not found."); + } + + normalizers = (URLNormalizer[]) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_" + scope); + if (normalizers == null) { + normalizers = getURLNormalizers(scope); + } + if (normalizers == EMPTY_NORMALIZERS) { + normalizers = (URLNormalizer[]) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT); + if (normalizers == null) { + normalizers = getURLNormalizers(SCOPE_DEFAULT); + } + } + + loopCount = conf.getInt("urlnormalizer.loop.count", 1); + } + + /** + * Function returns an array of {@link URLNormalizer}s for a given scope, with + * a specified order. 
+ * + * @param scope + * The scope to return the <code>Array</code> of + * {@link URLNormalizer}s for. + * @return An <code>Array</code> of {@link URLNormalizer}s for the given + * scope. + * @throws PluginRuntimeException + */ + URLNormalizer[] getURLNormalizers(String scope) { + List<Extension> extensions = getExtensions(scope); + ObjectCache objectCache = ObjectCache.get(conf); + + if (extensions == EMPTY_EXTENSION_LIST) { + return EMPTY_NORMALIZERS; + } + + List<URLNormalizer> normalizers = new Vector<URLNormalizer>( + extensions.size()); + + Iterator<Extension> it = extensions.iterator(); + while (it.hasNext()) { + Extension ext = it.next(); + URLNormalizer normalizer = null; + try { + // check to see if we've cached this URLNormalizer instance yet + normalizer = (URLNormalizer) objectCache.getObject(ext.getId()); + if (normalizer == null) { + // go ahead and instantiate it and then cache it + normalizer = (URLNormalizer) ext.getExtensionInstance(); + objectCache.setObject(ext.getId(), normalizer); + } + normalizers.add(normalizer); + } catch (PluginRuntimeException e) { + e.printStackTrace(); + LOG.warn("URLNormalizers:PluginRuntimeException when " + + "initializing url normalizer plugin " + + ext.getDescriptor().getPluginId() + + " instance in getURLNormalizers " + + "function: attempting to continue instantiating plugins"); + } + } + return normalizers.toArray(new URLNormalizer[normalizers.size()]); + } + + /** + * Finds the best-suited normalizer plugin for a given scope. + * + * @param scope + * Scope for which we seek a normalizer plugin. + * @return a list of extensions to be used for this scope. If none, returns + * empty list. 
+ * @throws PluginRuntimeException + */ + @SuppressWarnings("unchecked") + private List<Extension> getExtensions(String scope) { + ObjectCache objectCache = ObjectCache.get(conf); + List<Extension> extensions = (List<Extension>) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope); + + // Just compare the reference: + // if this is the empty list, we know we will find no extension. + if (extensions == EMPTY_EXTENSION_LIST) { + return EMPTY_EXTENSION_LIST; + } + + if (extensions == null) { + extensions = findExtensions(scope); + if (extensions != null) { + objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, + extensions); + } else { + // Put the empty extension list into cache + // to remember we don't know any related extension. + objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, + EMPTY_EXTENSION_LIST); + extensions = EMPTY_EXTENSION_LIST; + } + } + return extensions; + } + + /** + * searches a list of suitable url normalizer plugins for the given scope. + * + * @param scope + * Scope for which we seek a url normalizer plugin. + * @return List - List of extensions to be used for this scope. If none, + * returns null. + * @throws PluginRuntimeException + */ + private List<Extension> findExtensions(String scope) { + + String[] orders = null; + String orderlist = conf.get("urlnormalizer.order." + scope); + if (orderlist == null) + orderlist = conf.get("urlnormalizer.order"); + if (orderlist != null && !orderlist.trim().equals("")) { + orders = orderlist.trim().split("\\s+"); + } + String scopelist = conf.get("urlnormalizer.scope." 
+ scope); + Set<String> impls = null; + if (scopelist != null && !scopelist.trim().equals("")) { + String[] names = scopelist.split("\\s+"); + impls = new HashSet<String>(Arrays.asList(names)); + } + Extension[] extensions = this.extensionPoint.getExtensions(); + HashMap<String, Extension> normalizerExtensions = new HashMap<String, Extension>(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (impls != null && !impls.contains(extension.getClazz())) + continue; + normalizerExtensions.put(extension.getClazz(), extension); + } + List<Extension> res = new ArrayList<Extension>(); + if (orders == null) { + res.addAll(normalizerExtensions.values()); + } else { + // first add those explicitly named in correct order + for (int i = 0; i < orders.length; i++) { + Extension e = normalizerExtensions.get(orders[i]); + if (e != null) { + res.add(e); + normalizerExtensions.remove(orders[i]); + } + } + // then add all others in random order + res.addAll(normalizerExtensions.values()); + } + return res; + } + + /** + * Normalize + * + * @param urlString + * The URL string to normalize. + * @param scope + * The given scope. + * @return A normalized String, using the given <code>scope</code> + * @throws MalformedURLException + * If the given URL string is malformed. 
+ */ + public String normalize(String urlString, String scope) + throws MalformedURLException { + // optionally loop several times, and break if no further changes + String initialString = urlString; + for (int k = 0; k < loopCount; k++) { + for (int i = 0; i < this.normalizers.length; i++) { + if (urlString == null) + return null; + urlString = this.normalizers[i].normalize(urlString, scope); + } + if (initialString.equals(urlString)) + break; + initialString = urlString; + } + return urlString; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/package-info.java b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java new file mode 100644 index 0000000..19e0111 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Web-related interfaces: URL {@link org.apache.nutch.net.URLFilter filters} + * and {@link org.apache.nutch.net.URLNormalizer normalizers}. 
+ */ +package org.apache.nutch.net; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java new file mode 100644 index 0000000..5f4115b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.protocols; + +import java.util.Calendar; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; +import java.text.SimpleDateFormat; +import java.text.ParseException; + +/** + * class to handle HTTP dates. + * + * Modified from FastHttpDateFormat.java in jakarta-tomcat. 
/**
 * Formats and parses dates in the HTTP date format
 * (<code>EEE, dd MMM yyyy HH:mm:ss zzz</code>, US locale, GMT).
 *
 * Modified from FastHttpDateFormat.java in jakarta-tomcat.
 *
 * @author John Xing
 */
public class HttpDateFormat {

  /**
   * Shared formatter. Every use synchronizes on this object, so the reference
   * is final: a reassignable lock object could be swapped out and silently
   * break mutual exclusion.
   */
  protected static final SimpleDateFormat format = new SimpleDateFormat(
      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);

  /**
   * HTTP date uses TimeZone GMT
   */
  static {
    format.setTimeZone(TimeZone.getTimeZone("GMT"));
  }

  /**
   * Get the HTTP format of the specified date.
   *
   * @param date
   *          the date to format
   * @return the date rendered in HTTP date format
   */
  public static String toString(Date date) {
    synchronized (format) {
      return format.format(date);
    }
  }

  /**
   * Get the HTTP format of the instant held by the given calendar.
   *
   * @param cal
   *          calendar whose time is formatted
   * @return the date rendered in HTTP date format
   */
  public static String toString(Calendar cal) {
    return toString(cal.getTime());
  }

  /**
   * Get the HTTP format of the given time.
   *
   * @param time
   *          milliseconds since the epoch
   * @return the date rendered in HTTP date format
   */
  public static String toString(long time) {
    return toString(new Date(time));
  }

  /**
   * Parses a string in HTTP date format.
   *
   * @param dateString
   *          the text to parse
   * @return the parsed date
   * @throws ParseException
   *           if the text cannot be parsed
   */
  public static Date toDate(String dateString) throws ParseException {
    synchronized (format) {
      return format.parse(dateString);
    }
  }

  /**
   * Parses a string in HTTP date format.
   *
   * @param dateString
   *          the text to parse
   * @return the parsed time in milliseconds since the epoch
   * @throws ParseException
   *           if the text cannot be parsed
   */
  public static long toLong(String dateString) throws ParseException {
    return toDate(dateString).getTime();
  }

  /** Prints the current time formatted, then parsed and formatted again. */
  public static void main(String[] args) throws Exception {
    Date now = new Date(System.currentTimeMillis());

    String string = HttpDateFormat.toString(now);

    long time = HttpDateFormat.toLong(string);

    System.out.println(string);
    System.out.println(HttpDateFormat.toString(time));
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java new file mode 100644 index 0000000..0ae3776 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.protocols; + +import java.io.Serializable; + +/** + * Base exception for all protocol handlers + * + * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead. 
+ */ +@Deprecated +@SuppressWarnings("serial") +public class ProtocolException extends Exception implements Serializable { + /** Creates an exception with no detail message and no cause. */ + public ProtocolException() { + super(); + } + /** Creates an exception with the given detail message. */ + public ProtocolException(String message) { + super(message); + } + /** Creates an exception with the given detail message and cause. */ + public ProtocolException(String message, Throwable cause) { + super(message, cause); + } + /** Creates an exception that wraps the given cause. */ + public ProtocolException(Throwable cause) { + super(cause); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java new file mode 100644 index 0000000..efff14b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.protocols; + +// JDK imports +import java.net.URL; + +// Nutch imports +import org.apache.nutch.metadata.HttpHeaders; +import org.apache.nutch.metadata.Metadata; + +/** + * A response interface. Makes all protocols model HTTP. 
+ */ +public interface Response extends HttpHeaders { + + /** Returns the URL used to retrieve this response. */ + public URL getUrl(); + + /** Returns the response code. */ + public int getCode(); + + /** Returns the value of a named header. */ + public String getHeader(String name); + + /** Returns all the headers. */ + public Metadata getHeaders(); + + /** Returns the full content of the response. */ + public byte[] getContent(); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java new file mode 100644 index 0000000..8823f5b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Helper classes related to the {@link org.apache.nutch.protocol.Protocol Protocol} + * interface, see also {@link org.apache.nutch.protocol}. 
+ */ +package org.apache.nutch.net.protocols; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java new file mode 100644 index 0000000..c36c036 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java @@ -0,0 +1,203 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import java.net.URL; +import java.util.Iterator; +import java.util.Properties; + +import org.apache.nutch.metadata.Metadata; + +/** + * This class holds the information about HTML "meta" tags extracted from a + * page. Some special tags have convenience methods for easy checking. 
+ */ +public class HTMLMetaTags { + private boolean noIndex = false; + + private boolean noFollow = false; + + private boolean noCache = false; + + private URL baseHref = null; + + private boolean refresh = false; + + private int refreshTime = 0; + + private URL refreshHref = null; + + private Metadata generalTags = new Metadata(); + + private Properties httpEquivTags = new Properties(); + + /** + * Sets all boolean values to <code>false</code>. Clears all other tags. + */ + public void reset() { + noIndex = false; + noFollow = false; + noCache = false; + refresh = false; + refreshTime = 0; + baseHref = null; + refreshHref = null; + generalTags.clear(); + httpEquivTags.clear(); + } + + /** + * Sets <code>noFollow</code> to <code>true</code>. + */ + public void setNoFollow() { + noFollow = true; + } + + /** + * Sets <code>noIndex</code> to <code>true</code>. + */ + public void setNoIndex() { + noIndex = true; + } + + /** + * Sets <code>noCache</code> to <code>true</code>. + */ + public void setNoCache() { + noCache = true; + } + + /** + * Sets <code>refresh</code> to the supplied value. + */ + public void setRefresh(boolean refresh) { + this.refresh = refresh; + } + + /** + * Sets the <code>baseHref</code>. + */ + public void setBaseHref(URL baseHref) { + this.baseHref = baseHref; + } + + /** + * Sets the <code>refreshHref</code>. + */ + public void setRefreshHref(URL refreshHref) { + this.refreshHref = refreshHref; + } + + /** + * Sets the <code>refreshTime</code>. + */ + public void setRefreshTime(int refreshTime) { + this.refreshTime = refreshTime; + } + + /** + * A convenience method. Returns the current value of <code>noIndex</code>. + */ + public boolean getNoIndex() { + return noIndex; + } + + /** + * A convenience method. Returns the current value of <code>noFollow</code>. + */ + public boolean getNoFollow() { + return noFollow; + } + + /** + * A convenience method. Returns the current value of <code>noCache</code>. 
+ */ + public boolean getNoCache() { + return noCache; + } + + /** + * A convenience method. Returns the current value of <code>refresh</code>. + */ + public boolean getRefresh() { + return refresh; + } + + /** + * A convenience method. Returns the <code>baseHref</code>, if set, or + * <code>null</code> otherwise. + */ + public URL getBaseHref() { + return baseHref; + } + + /** + * A convenience method. Returns the <code>refreshHref</code>, if set, or + * <code>null</code> otherwise. The value may be invalid if + * {@link #getRefresh()}returns <code>false</code>. + */ + public URL getRefreshHref() { + return refreshHref; + } + + /** + * A convenience method. Returns the current value of <code>refreshTime</code> + * . The value may be invalid if {@link #getRefresh()}returns + * <code>false</code>. + */ + public int getRefreshTime() { + return refreshTime; + } + + /** + * Returns all collected values of the general meta tags. Property names are + * tag names, property values are "content" values. + */ + public Metadata getGeneralTags() { + return generalTags; + } + + /** + * Returns all collected values of the "http-equiv" meta tags. Property names + * are tag names, property values are "content" values. 
+ */ + public Properties getHttpEquivTags() { + return httpEquivTags; + } + + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow=" + + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh + + ", refreshHref=" + refreshHref + "\n"); + sb.append(" * general tags:\n"); + String[] names = generalTags.names(); + for (String name : names) { + String key = name; + sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n"); + } + sb.append(" * http-equiv tags:\n"); + Iterator<Object> it = httpEquivTags.keySet().iterator(); + it = httpEquivTags.keySet().iterator(); + while (it.hasNext()) { + String key = (String) it.next(); + sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n"); + } + return sb.toString(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java new file mode 100644 index 0000000..55b51ac --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// JDK imports +import org.w3c.dom.DocumentFragment; + +// Hadoop imports +import org.apache.hadoop.conf.Configurable; + +// Nutch imports +import org.apache.nutch.plugin.Pluggable; +import org.apache.nutch.protocol.Content; + +/** + * Extension point for DOM-based HTML parsers. Permits one to add additional + * metadata to HTML parses. All plugins found which implement this extension + * point are run sequentially on the parse. + */ +public interface HtmlParseFilter extends Pluggable, Configurable { + /** The name of the extension point. */ + final static String X_POINT_ID = HtmlParseFilter.class.getName(); + + /** + * Adds metadata or otherwise modifies a parse of HTML content, given the DOM + * tree of a page. + */ + ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java new file mode 100644 index 0000000..9dd9aad --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.DocumentFragment; + +/** Creates and caches {@link HtmlParseFilter} implementing plugins. */ +public class HtmlParseFilters { + + private HtmlParseFilter[] htmlParseFilters; + + public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order"; + + public HtmlParseFilters(Configuration conf) { + htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID, + HTMLPARSEFILTER_ORDER); + } + + /** Run all defined filters. */ + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + // loop on each filter + for (int i = 0; i < this.htmlParseFilters.length; i++) { + // call filter interface + parseResult = htmlParseFilters[i].filter(content, parseResult, metaTags, + doc); + + // any failure on parse obj, return + if (!parseResult.isSuccess()) { + // TODO: What happens when parseResult.isEmpty() ? + // Maybe clone parseResult and use parseResult as backup... 
+ + // remove failed parse before return + parseResult.filter(); + return parseResult; + } + } + + return parseResult; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java new file mode 100644 index 0000000..3ee0354 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Map.Entry; + +import org.apache.hadoop.io.MapWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/* An outgoing link from a page. 
*/ +public class Outlink implements Writable { + + private String toUrl; + private String anchor; + private MapWritable md; + + public Outlink() { + } + + public Outlink(String toUrl, String anchor) throws MalformedURLException { + this.toUrl = toUrl; + if (anchor == null) + anchor = ""; + this.anchor = anchor; + md = null; + } + + public void readFields(DataInput in) throws IOException { + toUrl = Text.readString(in); + anchor = Text.readString(in); + boolean hasMD = in.readBoolean(); + if (hasMD) { + md = new org.apache.hadoop.io.MapWritable(); + md.readFields(in); + } else + md = null; + } + + /** Skips over one Outlink in the input. */ + public static void skip(DataInput in) throws IOException { + Text.skip(in); // skip toUrl + Text.skip(in); // skip anchor + boolean hasMD = in.readBoolean(); + if (hasMD) { + MapWritable metadata = new org.apache.hadoop.io.MapWritable(); + metadata.readFields(in); + ; + } + } + + public void write(DataOutput out) throws IOException { + Text.writeString(out, toUrl); + Text.writeString(out, anchor); + if (md != null && md.size() > 0) { + out.writeBoolean(true); + md.write(out); + } else { + out.writeBoolean(false); + } + } + + public static Outlink read(DataInput in) throws IOException { + Outlink outlink = new Outlink(); + outlink.readFields(in); + return outlink; + } + + public String getToUrl() { + return toUrl; + } + + public void setUrl(String toUrl) { + this.toUrl = toUrl; + } + + public String getAnchor() { + return anchor; + } + + public MapWritable getMetadata() { + return md; + } + + public void setMetadata(MapWritable md) { + this.md = md; + } + + public boolean equals(Object o) { + if (!(o instanceof Outlink)) + return false; + Outlink other = (Outlink) o; + return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor); + } + + public String toString() { + StringBuffer repr = new StringBuffer("toUrl: "); + repr.append(toUrl); + repr.append(" anchor: "); + repr.append(anchor); + if (md != null && 
!md.isEmpty()) { + for (Entry<Writable, Writable> e : md.entrySet()) { + repr.append(" "); + repr.append(e.getKey()); + repr.append(": "); + repr.append(e.getValue()); + } + } + return repr.toString(); + } + + @Override + public int hashCode() { + return toUrl.hashCode() ^ anchor.hashCode(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java new file mode 100644 index 0000000..d1773f8 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java @@ -0,0 +1,145 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; + +/** + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from + * plain text using Regular Expressions. + * + * @see <a + * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison + * of different regexp-Implementations </a> + * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs + * </a> + * + * @author Stephan Strittmatter - http://www.sybit.de + * @version 1.0 + * @since 0.7 + */ +public class OutlinkExtractor { + private static final Logger LOG = LoggerFactory + .getLogger(OutlinkExtractor.class); + + /** + * Regex pattern to get URLs within a plain text. + * + * @see <a + * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html + + * </a> + */ + private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + + /** + * Extracts <code>Outlink</code> from given plain text. Applying this method + * to non-plain-text can result in extremely lengthy runtimes for parasitic + * cases (postscript is a known example). + * + * @param plainText + * the plain text from wich URLs should be extracted. 
+ * + * @return Array of <code>Outlink</code>s within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText, Configuration conf) { + return OutlinkExtractor.getOutlinks(plainText, "", conf); + } + + /** + * Extracts <code>Outlink</code> from given plain text and adds anchor to the + * extracted <code>Outlink</code>s + * + * @param plainText + * the plain text from wich URLs should be extracted. + * @param anchor + * the anchor of the url + * + * @return Array of <code>Outlink</code>s within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText, String anchor, + Configuration conf) { + long start = System.currentTimeMillis(); + final List<Outlink> outlinks = new ArrayList<Outlink>(); + + try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(URL_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + // loop the matches + while (matcher.contains(input, pattern)) { + // if this is taking too long, stop matching + // (SHOULD really check cpu time used so that heavily loaded systems + // do not unnecessarily hit this limit.) 
+ if (System.currentTimeMillis() - start >= 60000L) { + if (LOG.isWarnEnabled()) { + LOG.warn("Time limit exceeded for getOutLinks"); + } + break; + } + result = matcher.getMatch(); + url = result.group(0); + try { + outlinks.add(new Outlink(url, anchor)); + } catch (MalformedURLException mue) { + LOG.warn("Invalid url: '" + url + "', skipping."); + } + } + } catch (Exception ex) { + // if the matcher fails (perhaps a malformed URL) we just log it and move + // on + if (LOG.isErrorEnabled()) { + LOG.error("getOutlinks", ex); + } + } + + final Outlink[] retval; + + // create array of the Outlinks + if (outlinks != null && outlinks.size() > 0) { + retval = outlinks.toArray(new Outlink[0]); + } else { + retval = new Outlink[0]; + } + + return retval; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java new file mode 100644 index 0000000..9a33445 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +/** + * The result of parsing a page's raw content. + * + * @see Parser#getParse(Content) + */ +public interface Parse { + + /** + * The textual content of the page. This is indexed, searched, and used when + * generating snippets. + */ + String getText(); + + /** Other data extracted from the page. */ + ParseData getData(); + + /** Indicates if the parse is coming from a url or a sub-url */ + boolean isCanonical(); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java new file mode 100644 index 0000000..12cae8a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.util.concurrent.Callable; + +import org.apache.nutch.protocol.Content; + +class ParseCallable implements Callable<ParseResult> { + private Parser p; + private Content content; + + public ParseCallable(Parser p, Content content) { + this.p = p; + this.content = content; + } + + @Override + public ParseResult call() throws Exception { + return p.getParse(content); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java new file mode 100644 index 0000000..8189269 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java @@ -0,0 +1,255 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.io.*; +import java.util.*; + +import org.apache.commons.cli.Options; +import org.apache.hadoop.io.*; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.FileSystem; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.NutchConfiguration; + +/** + * Data extracted from a page's content. + * + * @see Parse#getData() + */ +public final class ParseData extends VersionedWritable { + public static final String DIR_NAME = "parse_data"; + + private final static byte VERSION = 5; + + private String title; + private Outlink[] outlinks; + private Metadata contentMeta; + private Metadata parseMeta; + private ParseStatus status; + private byte version = VERSION; + + public ParseData() { + contentMeta = new Metadata(); + parseMeta = new Metadata(); + } + + public ParseData(ParseStatus status, String title, Outlink[] outlinks, + Metadata contentMeta) { + this(status, title, outlinks, contentMeta, new Metadata()); + } + + public ParseData(ParseStatus status, String title, Outlink[] outlinks, + Metadata contentMeta, Metadata parseMeta) { + this.status = status; + this.title = title; + this.outlinks = outlinks; + this.contentMeta = contentMeta; + this.parseMeta = parseMeta; + } + + // + // Accessor methods + // + + /** The status of parsing the page. */ + public ParseStatus getStatus() { + return status; + } + + /** The title of the page. */ + public String getTitle() { + return title; + } + + /** The outlinks of the page. */ + public Outlink[] getOutlinks() { + return outlinks; + } + + /** The original Metadata retrieved from content */ + public Metadata getContentMeta() { + return contentMeta; + } + + /** + * Other content properties. This is the place to find format-specific + * properties. Different parser implementations for different content types + * will populate this differently. 
+ */ + public Metadata getParseMeta() { + return parseMeta; + } + + public void setParseMeta(Metadata parseMeta) { + this.parseMeta = parseMeta; + } + + public void setOutlinks(Outlink[] outlinks) { + this.outlinks = outlinks; + } + + /** + * Get a metadata single value. This method first looks for the metadata value + * in the parse metadata. If no value is found it the looks for the metadata + * in the content metadata. + * + * @see #getContentMeta() + * @see #getParseMeta() + */ + public String getMeta(String name) { + String value = parseMeta.get(name); + if (value == null) { + value = contentMeta.get(name); + } + return value; + } + + // + // Writable methods + // + + public byte getVersion() { + return version; + } + + public final void readFields(DataInput in) throws IOException { + + version = in.readByte(); + // incompatible change from UTF8 (version < 5) to Text + if (version != VERSION) + throw new VersionMismatchException(VERSION, version); + status = ParseStatus.read(in); + title = Text.readString(in); // read title + + int numOutlinks = in.readInt(); + outlinks = new Outlink[numOutlinks]; + for (int i = 0; i < numOutlinks; i++) { + outlinks[i] = Outlink.read(in); + } + + if (version < 3) { + int propertyCount = in.readInt(); // read metadata + contentMeta.clear(); + for (int i = 0; i < propertyCount; i++) { + contentMeta.add(Text.readString(in), Text.readString(in)); + } + } else { + contentMeta.clear(); + contentMeta.readFields(in); + } + if (version > 3) { + parseMeta.clear(); + parseMeta.readFields(in); + } + } + + public final void write(DataOutput out) throws IOException { + out.writeByte(VERSION); // write version + status.write(out); // write status + Text.writeString(out, title); // write title + + out.writeInt(outlinks.length); // write outlinks + for (int i = 0; i < outlinks.length; i++) { + outlinks[i].write(out); + } + contentMeta.write(out); // write content metadata + parseMeta.write(out); + } + + public static ParseData read(DataInput 
in) throws IOException { + ParseData parseText = new ParseData(); + parseText.readFields(in); + return parseText; + } + + // + // other methods + // + + public boolean equals(Object o) { + if (!(o instanceof ParseData)) + return false; + ParseData other = (ParseData) o; + return this.status.equals(other.status) && this.title.equals(other.title) + && Arrays.equals(this.outlinks, other.outlinks) + && this.contentMeta.equals(other.contentMeta) + && this.parseMeta.equals(other.parseMeta); + } + + public String toString() { + StringBuffer buffer = new StringBuffer(); + + buffer.append("Version: " + version + "\n"); + buffer.append("Status: " + status + "\n"); + buffer.append("Title: " + title + "\n"); + + if (outlinks != null) { + buffer.append("Outlinks: " + outlinks.length + "\n"); + for (int i = 0; i < outlinks.length; i++) { + buffer.append(" outlink: " + outlinks[i] + "\n"); + } + } + + buffer.append("Content Metadata: " + contentMeta + "\n"); + buffer.append("Parse Metadata: " + parseMeta + "\n"); + + return buffer.toString(); + } + + public static void main(String argv[]) throws Exception { + String usage = "ParseData (-local | -dfs <namenode:port>) recno segment"; + + if (argv.length < 3) { + System.out.println("usage:" + usage); + return; + } + + Options opts = new Options(); + Configuration conf = NutchConfiguration.create(); + + GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv); + + String[] remainingArgs = parser.getRemainingArgs(); + FileSystem fs = FileSystem.get(conf); + + try { + int recno = Integer.parseInt(remainingArgs[0]); + String segment = remainingArgs[1]; + + Path file = new Path(segment, DIR_NAME); + System.out.println("Reading from file: " + file); + + ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf); + + ParseData parseDatum = new ParseData(); + parses.get(recno, parseDatum); + + System.out.println("Retrieved " + recno + " from file " + file); + System.out.println(parseDatum); + + parses.close(); 
+ } finally { + fs.close(); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java new file mode 100644 index 0000000..3f27e33 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Checked exception of the parse package. Every constructor simply forwards
 * its arguments to the matching {@link Exception} constructor.
 */
@SuppressWarnings("serial")
public class ParseException extends Exception {

  /** Creates an exception without message or cause. */
  public ParseException() {
  }

  /** Creates an exception that wraps the given cause. */
  public ParseException(Throwable cause) {
    super(cause);
  }

  /** Creates an exception carrying the given message. */
  public ParseException(String message) {
    super(message);
  }

  /** Creates an exception carrying both a message and a cause. */
  public ParseException(String message, Throwable cause) {
    super(message, cause);
  }

}
+ * + * @see Parser#getParse(Content) + */ +public class ParseImpl implements Parse, Writable { + private ParseText text; + private ParseData data; + private boolean isCanonical; + + public ParseImpl() { + } + + public ParseImpl(Parse parse) { + this(new ParseText(parse.getText()), parse.getData(), true); + } + + public ParseImpl(String text, ParseData data) { + this(new ParseText(text), data, true); + } + + public ParseImpl(ParseText text, ParseData data) { + this(text, data, true); + } + + public ParseImpl(ParseText text, ParseData data, boolean isCanonical) { + this.text = text; + this.data = data; + this.isCanonical = isCanonical; + } + + public String getText() { + return text.getText(); + } + + public ParseData getData() { + return data; + } + + public boolean isCanonical() { + return isCanonical; + } + + public final void write(DataOutput out) throws IOException { + out.writeBoolean(isCanonical); + text.write(out); + data.write(out); + } + + public void readFields(DataInput in) throws IOException { + isCanonical = in.readBoolean(); + text = new ParseText(); + text.readFields(in); + + data = new ParseData(); + data.readFields(in); + } + + public static ParseImpl read(DataInput in) throws IOException { + ParseImpl parseImpl = new ParseImpl(); + parseImpl.readFields(in); + return parseImpl; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java new file mode 100644 index 0000000..51b32fc --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java @@ -0,0 +1,398 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// Commons Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.io.*; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.SequenceFile.Metadata; +import org.apache.hadoop.io.compress.DefaultCodec; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.mapred.*; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.fetcher.Fetcher; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.*; + +import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import org.apache.hadoop.util.Progressable; + +/* Parse content in a segment. 
*/ +public class ParseOutputFormat implements OutputFormat<Text, Parse> { + private static final Logger LOG = LoggerFactory + .getLogger(ParseOutputFormat.class); + private URLFilters filters; + private URLExemptionFilters exemptionFilters; + private URLNormalizers normalizers; + private ScoringFilters scfilters; + + private static class SimpleEntry implements Entry<Text, CrawlDatum> { + private Text key; + private CrawlDatum value; + + public SimpleEntry(Text key, CrawlDatum value) { + this.key = key; + this.value = value; + } + + public Text getKey() { + return key; + } + + public CrawlDatum getValue() { + return value; + } + + public CrawlDatum setValue(CrawlDatum value) { + this.value = value; + return this.value; + } + } + + public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException { + Path out = FileOutputFormat.getOutputPath(job); + if ((out == null) && (job.getNumReduceTasks() != 0)) { + throw new InvalidJobConfException("Output directory not set in JobConf."); + } + if (fs == null) { + fs = out.getFileSystem(job); + } + if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME))) + throw new IOException("Segment already parsed!"); + } + + public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, + String name, Progressable progress) throws IOException { + + if (job.getBoolean("parse.filter.urls", true)) { + filters = new URLFilters(job); + exemptionFilters = new URLExemptionFilters(job); + } + + if (job.getBoolean("parse.normalize.urls", true)) { + normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); + } + + this.scfilters = new ScoringFilters(job); + final int interval = job.getInt("db.fetch.interval.default", 2592000); + final boolean ignoreInternalLinks = job.getBoolean( + "db.ignore.internal.links", false); + final boolean ignoreExternalLinks = job.getBoolean( + "db.ignore.external.links", false); + final String ignoreExternalLinksMode = job.get( + "db.ignore.external.links.mode", "byHost"); + + int 
maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100); + final boolean isParsing = job.getBoolean("fetcher.parse", true); + final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE + : maxOutlinksPerPage; + final CompressionType compType = SequenceFileOutputFormat + .getOutputCompressionType(job); + Path out = FileOutputFormat.getOutputPath(job); + + Path text = new Path(new Path(out, ParseText.DIR_NAME), name); + Path data = new Path(new Path(out, ParseData.DIR_NAME), name); + Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name); + + final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "") + .split(" *, *"); + + // textOut Options + Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class); + org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress); + org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD); + + final MapFile.Writer textOut = new MapFile.Writer(job, text, + tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt); + + // dataOut Options + Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class); + org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable(progress); + org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType); + + final MapFile.Writer dataOut = new MapFile.Writer(job, data, + dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt); + + final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job, SequenceFile.Writer.file(crawl), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(CrawlDatum.class), + 
SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)), + SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)), + SequenceFile.Writer.blockSize(1073741824), + SequenceFile.Writer.compression(compType, new DefaultCodec()), + SequenceFile.Writer.progressable(progress), + SequenceFile.Writer.metadata(new Metadata())); + + return new RecordWriter<Text, Parse>() { + + public void write(Text key, Parse parse) throws IOException { + + String fromUrl = key.toString(); + // host or domain name of the source URL + String origin = null; + textOut.append(key, new ParseText(parse.getText())); + + ParseData parseData = parse.getData(); + // recover the signature prepared by Fetcher or ParseSegment + String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY); + if (sig != null) { + byte[] signature = StringUtil.fromHexString(sig); + if (signature != null) { + // append a CrawlDatum with a signature + CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); + d.setSignature(signature); + crawlOut.append(key, d); + } + } + + // see if the parse metadata contain things that we'd like + // to pass to the metadata of the crawlDB entry + CrawlDatum parseMDCrawlDatum = null; + for (String mdname : parseMDtoCrawlDB) { + String mdvalue = parse.getData().getParseMeta().get(mdname); + if (mdvalue != null) { + if (parseMDCrawlDatum == null) + parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, + 0); + parseMDCrawlDatum.getMetaData().put(new Text(mdname), + new Text(mdvalue)); + } + } + if (parseMDCrawlDatum != null) + crawlOut.append(key, parseMDCrawlDatum); + + // need to determine origin (once for all outlinks) + if (ignoreExternalLinks || ignoreInternalLinks) { + URL originURL = new URL(fromUrl.toString()); + // based on domain? 
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { + origin = URLUtil.getDomainName(originURL).toLowerCase(); + } + // use host + else { + origin = originURL.getHost().toLowerCase(); + } + } + + ParseStatus pstatus = parseData.getStatus(); + if (pstatus != null && pstatus.isSuccess() + && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { + String newUrl = pstatus.getMessage(); + int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); + newUrl = filterNormalize(fromUrl, newUrl, origin, + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers, + URLNormalizers.SCOPE_FETCHER); + + if (newUrl != null) { + String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, + refreshTime < Fetcher.PERM_REFRESH_TIME); + CrawlDatum newDatum = new CrawlDatum(); + newDatum.setStatus(CrawlDatum.STATUS_LINKED); + if (reprUrl != null && !reprUrl.equals(newUrl)) { + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + } + crawlOut.append(new Text(newUrl), newDatum); + } + } + + // collect outlinks for subsequent db update + Outlink[] links = parseData.getOutlinks(); + int outlinksToStore = Math.min(maxOutlinks, links.length); + + int validCount = 0; + CrawlDatum adjust = null; + List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>( + outlinksToStore); + List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); + for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { + String toUrl = links[i].getToUrl(); + + // Only normalize and filter if fetcher.parse = false + if (!isParsing) { + toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers); + if (toUrl == null) { + continue; + } + } + + CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); + Text targetUrl = new Text(toUrl); + + // see if the outlink has any 
metadata attached + // and if so pass that to the crawldatum so that + // the initial score or distribution can use that + MapWritable outlinkMD = links[i].getMetadata(); + if (outlinkMD != null) { + target.getMetaData().putAll(outlinkMD); + } + + try { + scfilters.initialScore(targetUrl, target); + } catch (ScoringFilterException e) { + LOG.warn("Cannot filter init score for url " + key + + ", using default: " + e.getMessage()); + target.setScore(0.0f); + } + + targets.add(new SimpleEntry(targetUrl, target)); + + // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174) + links[i].setUrl(toUrl); + outlinkList.add(links[i]); + validCount++; + } + + try { + // compute score contributions and adjustment to the original score + adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, + null, links.length); + } catch (ScoringFilterException e) { + LOG.warn("Cannot distribute score from " + key + ": " + + e.getMessage()); + } + for (Entry<Text, CrawlDatum> target : targets) { + crawlOut.append(target.getKey(), target.getValue()); + } + if (adjust != null) + crawlOut.append(key, adjust); + + Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList + .size()]); + parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), + filteredLinks, parseData.getContentMeta(), parseData.getParseMeta()); + dataOut.append(key, parseData); + if (!parse.isCanonical()) { + CrawlDatum datum = new CrawlDatum(); + datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); + String timeString = parse.getData().getContentMeta() + .get(Nutch.FETCH_TIME_KEY); + try { + datum.setFetchTime(Long.parseLong(timeString)); + } catch (Exception e) { + LOG.warn("Can't read fetch time for: " + key); + datum.setFetchTime(System.currentTimeMillis()); + } + crawlOut.append(key, datum); + } + } + + public void close(Reporter reporter) throws IOException { + textOut.close(); + dataOut.close(); + crawlOut.close(); + } + + }; + + } + + public static String 
filterNormalize(String fromUrl, String toUrl, + String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks, + String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters exemptionFilters, + URLNormalizers normalizers) { + return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks, + ignoreExternalLinksMode, filters, exemptionFilters, normalizers, + URLNormalizers.SCOPE_OUTLINK); + } + + public static String filterNormalize(String fromUrl, String toUrl, + String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, + String ignoreExternalLinksMode, URLFilters filters, + URLExemptionFilters exemptionFilters, URLNormalizers normalizers, + String urlNormalizerScope) { + // ignore links to self (or anchors within the page) + if (fromUrl.equals(toUrl)) { + return null; + } + if (ignoreExternalLinks || ignoreInternalLinks) { + URL targetURL = null; + try { + targetURL = new URL(toUrl); + } catch (MalformedURLException e1) { + return null; // skip it + } + if (ignoreExternalLinks) { + if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { + String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); + //FIXME: toDomain will never be null, correct? + if (toDomain == null || !toDomain.equals(origin)) { + return null; // skip it + } + } else { + String toHost = targetURL.getHost().toLowerCase(); + if (!toHost.equals(origin)) { // external host link + if (exemptionFilters == null // check if it is exempted? + || !exemptionFilters.isExempted(fromUrl, toUrl)) { + return null; ///skip it, This external url is not exempted. + } + } + } + } + if (ignoreInternalLinks) { + if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { + String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); + //FIXME: toDomain will never be null, correct? 
+ if (toDomain == null || toDomain.equals(origin)) { + return null; // skip it + } + } else { + String toHost = targetURL.getHost().toLowerCase(); + //FIXME: toDomain will never be null, correct? + if (toHost == null || toHost.equals(origin)) { + return null; // skip it + } + } + } + } + + try { + if (normalizers != null) { + toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize + // the url + } + if (filters != null) { + toUrl = filters.filter(toUrl); // filter the url + } + if (toUrl == null) { + return null; + } + } catch (Exception e) { + return null; + } + + return toUrl; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java new file mode 100644 index 0000000..6ad0ac8 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * This class represents a natural ordering for which parsing plugin should
 * get called for a particular mimeType. It provides methods to store the
 * parse-plugins.xml data, and methods to retrieve the name of the appropriate
 * parsing plugin for a contentType.
 *
 * @author mattmann
 * @version 1.0
 */
class ParsePluginList {

  /* maps a mimeType to its ordered list of parsing plugins */
  private Map<String, List<String>> pluginsByMimeType = new HashMap<String, List<String>>();

  /* the alias table */
  private Map<String, String> aliases = new HashMap<String, String>();

  /**
   * Constructs a new ParsePluginList with no mime types and no aliases.
   */
  ParsePluginList() {
  }

  /**
   * Looks up the plugins registered for a mime type.
   *
   * @param mimeType
   *          the mime type to look up
   * @return the list stored for {@code mimeType}, or {@code null} if none
   *         was stored
   */
  List<String> getPluginList(String mimeType) {
    return pluginsByMimeType.get(mimeType);
  }

  /**
   * Replaces the alias table; the given map is stored as is, not copied.
   *
   * @param aliases
   *          the new alias table
   */
  void setAliases(Map<String, String> aliases) {
    this.aliases = aliases;
  }

  /**
   * @return the alias table currently held (the stored map itself)
   */
  Map<String, String> getAliases() {
    return this.aliases;
  }

  /**
   * Registers the plugin list of a mime type, replacing any earlier entry.
   *
   * @param mimeType
   *          the mime type
   * @param l
   *          the ordered plugin list for that mime type
   */
  void setPluginList(String mimeType, List<String> l) {
    pluginsByMimeType.put(mimeType, l);
  }

  /**
   * @return a fixed-size list holding a snapshot of all mime types that have
   *         a plugin list
   */
  List<String> getSupportedMimeTypes() {
    String[] mimeTypes = pluginsByMimeType.keySet().toArray(
        new String[pluginsByMimeType.size()]);
    return Arrays.asList(mimeTypes);
  }

}
