http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java new file mode 100644 index 0000000..a73187b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java @@ -0,0 +1,279 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.util; + +// JDK imports +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Tika imports +import org.apache.tika.Tika; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.mime.MimeTypesFactory; + +// Slf4j logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// imported for Javadoc +import org.apache.nutch.protocol.ProtocolOutput; + +/** + * @author mattmann + * @since NUTCH-608 + * + * <p> + * This is a facade class to insulate Nutch from its underlying Mime Type + * substrate library, <a href="http://incubator.apache.org/tika/">Apache + * Tika</a>. Any mime handling code should be placed in this utility + * class, and hidden from the Nutch classes that rely on it. 
+ * </p> + */ +public final class MimeUtil { + + private static final String SEPARATOR = ";"; + + /* our Tika mime type registry */ + private MimeTypes mimeTypes; + + /* the tika detectors */ + private Tika tika; + + /* whether or not magic should be employed or not */ + private boolean mimeMagic; + + /* our log stream */ + private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class + .getName()); + + public MimeUtil(Configuration conf) { + tika = new Tika(); + ObjectCache objectCache = ObjectCache.get(conf); + MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class + .getName()); + if (mimeTypez == null) { + try { + String customMimeTypeFile = conf.get("mime.types.file"); + if (customMimeTypeFile != null + && customMimeTypeFile.equals("") == false) { + try { + LOG.info("Using custom mime.types.file: {}", customMimeTypeFile); + mimeTypez = MimeTypesFactory.create(conf + .getConfResourceAsInputStream(customMimeTypeFile)); + } catch (Exception e) { + LOG.error("Can't load mime.types.file : " + customMimeTypeFile + + " using Tika's default"); + } + } + if (mimeTypez == null) + mimeTypez = MimeTypes.getDefaultMimeTypes(); + } catch (Exception e) { + LOG.error("Exception in MimeUtil " + e.getMessage()); + throw new RuntimeException(e); + } + objectCache.setObject(MimeTypes.class.getName(), mimeTypez); + } + + this.mimeTypes = mimeTypez; + this.mimeMagic = conf.getBoolean("mime.type.magic", true); + } + + /** + * Cleans a {@link MimeType} name by removing out the actual {@link MimeType}, + * from a string of the form: + * + * <pre> + * <primary type>/<sub type> ; < optional params + * </pre> + * + * @param origType + * The original mime type string to be cleaned. + * @return The primary type, and subtype, concatenated, e.g., the actual mime + * type. 
+ */ + public static String cleanMimeType(String origType) { + if (origType == null) + return null; + + // take the origType and split it on ';' + String[] tokenizedMimeType = origType.split(SEPARATOR); + if (tokenizedMimeType.length > 1) { + // there was a ';' in there, take the first value + return tokenizedMimeType[0]; + } else { + // there wasn't a ';', so just return the orig type + return origType; + } + } + + /** + * A facade interface to trying all the possible mime type resolution + * strategies available within Tika. First, the mime type provided in + * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then + * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes} + * registry, by its cleaned name. If the {@link MimeType} is found, then that + * mime type is used, otherwise URL resolution is used to try and determine + * the mime type. However, if <code>mime.type.magic</code> is enabled in + * {@link NutchConfiguration}, then mime type magic resolution is used to try + * and obtain a better-than-the-default approximation of the {@link MimeType}. + * + * @param typeName + * The original mime type, returned from a {@link ProtocolOutput}. + * @param url + * The given @see url, that Nutch was trying to crawl. + * @param data + * The byte data, returned from the crawl, if any. + * @return The correctly, automatically guessed {@link MimeType} name. + */ + public String autoResolveContentType(String typeName, String url, byte[] data) { + String retType = null; + MimeType type = null; + String cleanedMimeType = null; + + cleanedMimeType = MimeUtil.cleanMimeType(typeName); + // first try to get the type from the cleaned type name + if (cleanedMimeType != null) { + try { + type = mimeTypes.forName(cleanedMimeType); + cleanedMimeType = type.getName(); + } catch (MimeTypeException mte) { + // Seems to be a malformed mime type name... 
+ cleanedMimeType = null; + } + } + + // if returned null, or if it's the default type then try url resolution + if (type == null + || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) { + // If no mime-type header, or cannot find a corresponding registered + // mime-type, then guess a mime-type from the url pattern + try { + retType = tika.detect(url) != null ? tika.detect(url) : null; + } catch (Exception e) { + String message = "Problem loading default Tika configuration"; + LOG.error(message, e); + throw new RuntimeException(e); + } + } else { + retType = type.getName(); + } + + // if magic is enabled use mime magic to guess if the mime type returned + // from the magic guess is different than the one that's already set so far + // if it is, and it's not the default mime type, then go with the mime type + // returned by the magic + if (this.mimeMagic) { + String magicType = null; + // pass URL (file name) and (cleansed) content type from protocol to Tika + Metadata tikaMeta = new Metadata(); + tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url); + tikaMeta.add(Metadata.CONTENT_TYPE, + (cleanedMimeType != null ? 
cleanedMimeType : typeName)); + try { + InputStream stream = TikaInputStream.get(data); + try { + magicType = mimeTypes.detect(stream, tikaMeta).toString(); + } finally { + stream.close(); + } + } catch (IOException ignore) { + } + + if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM) + && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null + && !retType.equals(magicType)) { + + // If magic enabled and the current mime type differs from that of the + // one returned from the magic, take the magic mimeType + retType = magicType; + } + + // if type is STILL null after all the resolution strategies, go for the + // default type + if (retType == null) { + try { + retType = MimeTypes.OCTET_STREAM; + } catch (Exception ignore) { + } + } + } + + return retType; + } + + /** + * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)} + * method. + * + * @param url + * A string representation of the document {@link URL} to sense the + * {@link MimeType} for. + * @return An appropriate {@link MimeType}, identified from the given Document + * url in string form. + */ + public String getMimeType(String url) { + return tika.detect(url); + } + + /** + * A facade interface to Tika's underlying {@link MimeTypes#forName(String)} + * method. + * + * @param name + * The name of a valid {@link MimeType} in the Tika mime registry. + * @return The object representation of the {@link MimeType}, if it exists, or + * null otherwise. + */ + public String forName(String name) { + try { + return this.mimeTypes.forName(name).toString(); + } catch (MimeTypeException e) { + LOG.error("Exception getting mime type by name: [" + name + + "]: Message: " + e.getMessage()); + return null; + } + } + + /** + * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)} + * method. + * + * @param f + * The {@link File} to sense the {@link MimeType} for. 
+ * @return The {@link MimeType} of the given {@link File}, or null if it + * cannot be determined. + */ + public String getMimeType(File f) { + try { + return tika.detect(f); + } catch (Exception e) { + LOG.error("Exception getting mime type for file: [" + f.getPath() + + "]: Message: " + e.getMessage()); + return null; + } + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java new file mode 100644 index 0000000..c99bae0 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.util; + +import java.util.Stack; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * <p> + * A utility class that allows the walking of any DOM tree using a stack instead + * of recursion. As the node tree is walked the next node is popped off of the + * stack and all of its children are automatically added to the stack to be + * called in tree order. + * </p> + * + * <p> + * Currently this class is not thread safe. It is assumed that only one thread + * will be accessing the <code>NodeWalker</code> at any given time. 
+ * </p> + */ +public class NodeWalker { + + // the root node the the stack holding the nodes + private Node currentNode; + private NodeList currentChildren; + private Stack<Node> nodes; + + /** + * Starts the <code>Node</code> tree from the root node. + * + * @param rootNode + */ + public NodeWalker(Node rootNode) { + + nodes = new Stack<Node>(); + nodes.add(rootNode); + } + + /** + * <p> + * Returns the next <code>Node</code> on the stack and pushes all of its + * children onto the stack, allowing us to walk the node tree without the use + * of recursion. If there are no more nodes on the stack then null is + * returned. + * </p> + * + * @return Node The next <code>Node</code> on the stack or null if there isn't + * a next node. + */ + public Node nextNode() { + + // if no next node return null + if (!hasNext()) { + return null; + } + + // pop the next node off of the stack and push all of its children onto + // the stack + currentNode = nodes.pop(); + currentChildren = currentNode.getChildNodes(); + int childLen = (currentChildren != null) ? currentChildren.getLength() : 0; + + // put the children node on the stack in first to last order + for (int i = childLen - 1; i >= 0; i--) { + nodes.add(currentChildren.item(i)); + } + + return currentNode; + } + + /** + * <p> + * Skips over and removes from the node stack the children of the last node. + * When getting a next node from the walker, that node's children are + * automatically added to the stack. You can call this method to remove those + * children from the stack. + * </p> + * + * <p> + * This is useful when you don't want to process deeper into the current path + * of the node tree but you want to continue processing sibling nodes. + * </p> + * + */ + public void skipChildren() { + + int childLen = (currentChildren != null) ? 
currentChildren.getLength() : 0; + + for (int i = 0; i < childLen; i++) { + Node child = nodes.peek(); + if (child.equals(currentChildren.item(i))) { + nodes.pop(); + } + } + } + + /** + * Return the current node. + * + * @return Node + */ + public Node getCurrentNode() { + return currentNode; + } + + /** + * @return returns true if there are more nodes on the current stack. + * + */ + public boolean hasNext() { + return (nodes.size() > 0); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java new file mode 100644 index 0000000..ac71a93 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.util; + +import java.util.Map.Entry; +import java.util.Properties; +import java.util.UUID; + +import org.apache.hadoop.conf.Configuration; + +/** + * Utility to create Hadoop {@link Configuration}s that include Nutch-specific + * resources. + */ +public class NutchConfiguration { + public static final String UUID_KEY = "nutch.conf.uuid"; + + private NutchConfiguration() { + } // singleton + + /* + * Configuration.hashCode() doesn't return values that correspond to a unique + * set of parameters. This is a workaround so that we can track instances of + * Configuration created by Nutch. + */ + private static void setUUID(Configuration conf) { + UUID uuid = UUID.randomUUID(); + conf.set(UUID_KEY, uuid.toString()); + } + + /** + * Retrieve a Nutch UUID of this configuration object, or null if the + * configuration was created elsewhere. + * + * @param conf + * configuration instance + * @return uuid or null + */ + public static String getUUID(Configuration conf) { + return conf.get(UUID_KEY); + } + + /** + * Create a {@link Configuration} for Nutch. This will load the standard Nutch + * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code> + * overrides. + */ + public static Configuration create() { + Configuration conf = new Configuration(); + setUUID(conf); + addNutchResources(conf); + return conf; + } + + /** + * Create a {@link Configuration} from supplied properties. + * + * @param addNutchResources + * if true, then first <code>nutch-default.xml</code>, and then + * <code>nutch-site.xml</code> will be loaded prior to applying the + * properties. Otherwise these resources won't be used. 
+ * @param nutchProperties + * a set of properties to define (or override) + */ + public static Configuration create(boolean addNutchResources, + Properties nutchProperties) { + Configuration conf = new Configuration(); + setUUID(conf); + if (addNutchResources) { + addNutchResources(conf); + } + for (Entry<Object, Object> e : nutchProperties.entrySet()) { + conf.set(e.getKey().toString(), e.getValue().toString()); + } + return conf; + } + + /** + * Add the standard Nutch resources to {@link Configuration}. + * + * @param conf + * Configuration object to which configuration is to be added. + */ + private static Configuration addNutchResources(Configuration conf) { + conf.addResource("nutch-default.xml"); + conf.addResource("nutch-site.xml"); + return conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java new file mode 100644 index 0000000..8b4f8e0 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; + +/** A {@link JobConf} for Nutch jobs. */ +public class NutchJob extends JobConf { + + public NutchJob(Configuration conf) { + super(conf, NutchJob.class); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java new file mode 100644 index 0000000..8e75177 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.util; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.mapreduce.Job; +import org.apache.nutch.metadata.Nutch; + +public abstract class NutchTool extends Configured { + + protected HashMap<String, Object> results = new HashMap<String, Object>(); + protected Map<String, Object> status = Collections + .synchronizedMap(new HashMap<String, Object>()); + protected Job currentJob; + protected int numJobs; + protected int currentJobNum; + + /** + * Runs the tool, using a map of arguments. May return results, or null. + */ + public abstract Map<String, Object> run(Map<String, Object> args, String crawlId) + throws Exception; + + public NutchTool(Configuration conf){ + super(conf); + } + + public NutchTool(){ + super(null); + } + + /** Returns relative progress of the tool, a float in range [0,1]. */ + public float getProgress() { + float res = 0; + if (currentJob != null) { + try { + res = (currentJob.mapProgress() + currentJob.reduceProgress()) / 2.0f; + } catch (IOException e) { + e.printStackTrace(); + res = 0; + } catch (IllegalStateException ile) { + ile.printStackTrace(); + res = 0; + } + } + // take into account multiple jobs + if (numJobs > 1) { + res = (currentJobNum + res) / (float) numJobs; + } + status.put(Nutch.STAT_PROGRESS, res); + return res; + } + + /** Returns current status of the running tool. */ + public Map<String, Object> getStatus() { + return status; + } + + /** + * Stop the job with the possibility to resume. Subclasses should override + * this, since by default it calls {@link #killJob()}. + * + * @return true if succeeded, false otherwise + */ + public boolean stopJob() throws Exception { + return killJob(); + } + + /** + * Kill the job immediately. 
Clients should assume that any results that the + * job produced so far are in inconsistent state or missing. + * + * @return true if succeeded, false otherwise. + * @throws Exception + */ + public boolean killJob() throws Exception { + if (currentJob != null && !currentJob.isComplete()) { + try { + currentJob.killJob(); + return true; + } catch (Exception e) { + e.printStackTrace(); + return false; + } + } + return false; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java new file mode 100644 index 0000000..0277ee6 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.util; + +import java.util.HashMap; +import java.util.WeakHashMap; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class ObjectCache { + + private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class); + + private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>(); + + private final HashMap<String, Object> objectMap; + + private ObjectCache() { + objectMap = new HashMap<String, Object>(); + } + + public synchronized static ObjectCache get(Configuration conf) { + ObjectCache objectCache = CACHE.get(conf); + if (objectCache == null) { + LOG.debug("No object cache found for conf=" + conf + + ", instantiating a new object cache"); + objectCache = new ObjectCache(); + CACHE.put(conf, objectCache); + } + return objectCache; + } + + public synchronized Object getObject(String key) { + return objectMap.get(key); + } + + public synchronized void setObject(String key, Object value) { + objectMap.put(key, value); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java new file mode 100644 index 0000000..e323b67 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.util.Collection; +import java.util.Iterator; + +/** + * A class for efficiently matching <code>String</code>s against a set of + * prefixes. + */ +public class PrefixStringMatcher extends TrieStringMatcher { + + /** + * Creates a new <code>PrefixStringMatcher</code> which will match + * <code>String</code>s with any prefix in the supplied array. Zero-length + * <code>Strings</code> are ignored. + */ + public PrefixStringMatcher(String[] prefixes) { + super(); + for (int i = 0; i < prefixes.length; i++) + addPatternForward(prefixes[i]); + } + + /** + * Creates a new <code>PrefixStringMatcher</code> which will match + * <code>String</code>s with any prefix in the supplied + * <code>Collection</code>. + * + * @throws ClassCastException + * if any <code>Object</code>s in the collection are not + * <code>String</code>s + */ + public PrefixStringMatcher(Collection<String> prefixes) { + super(); + Iterator<String> iter = prefixes.iterator(); + while (iter.hasNext()) + addPatternForward(iter.next()); + } + + /** + * Returns true if the given <code>String</code> is matched by a prefix in the + * trie + */ + public boolean matches(String input) { + TrieNode node = root; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) + return false; + if (node.isTerminal()) + return true; + } + return false; + } + + /** + * Returns the shortest prefix of <code>input<code> that is matched, + * or <code>null<code> if no match exists. 
+ */ + public String shortestMatch(String input) { + TrieNode node = root; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) + return null; + if (node.isTerminal()) + return input.substring(0, i + 1); + } + return null; + } + + /** + * Returns the longest prefix of <code>input<code> that is matched, + * or <code>null<code> if no match exists. + */ + public String longestMatch(String input) { + TrieNode node = root; + String result = null; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) + break; + if (node.isTerminal()) + result = input.substring(0, i + 1); + } + return result; + } + + public static final void main(String[] argv) { + PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] { + "abcd", "abc", "aac", "baz", "foo", "foobar" }); + + String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac", + "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", }; + + for (int i = 0; i < tests.length; i++) { + System.out.println("testing: " + tests[i]); + System.out.println(" matches: " + matcher.matches(tests[i])); + System.out.println(" shortest: " + matcher.shortestMatch(tests[i])); + System.out.println(" longest: " + matcher.longestMatch(tests[i])); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java new file mode 100644 index 0000000..d26cbfc --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.text.SimpleDateFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.TimingUtil; +import org.apache.nutch.metadata.Nutch; + +/** + * Extracts protocol status code information from the crawl database. + * + * ProtocolStatusStatistics will give you information on the count + * of all status codes encountered on your crawl. 
This can be useful + * for checking a number of things. + * + * An example output run showing the number of encountered status + * codes such as 200, 300, and a count of un-fetched record. + * + * 38 200 + * 19 301 + * 2 302 + * 665 UNFETCHED + * + */ +public class ProtocolStatusStatistics extends Configured implements Tool { + + private static final Logger LOG = LoggerFactory + .getLogger(ProtocolStatusStatistics.class); + + private static final Text UNFETCHED_TEXT = new Text("UNFETCHED"); + + public static Configuration conf; + + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]"); + + System.err.println("\tinputDirs\tComma separated list of crawldb input directories"); + System.err.println("\t\t\tE.g.: crawl/crawldb/"); + + System.err.println("\toutDir\t\tOutput directory where results should be dumped"); + + System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. 
Defaults to 1."); + return 1; + } + String inputDir = args[0]; + String outputDir = args[1]; + + int numOfReducers = 1; + + if (args.length > 3) { + numOfReducers = Integer.parseInt(args[3]); + } + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + long start = System.currentTimeMillis(); + LOG.info("ProtocolStatistics: starting at " + sdf.format(start)); + + String jobName = "ProtocolStatistics"; + + conf = getConf(); + conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + + Job job = Job.getInstance(conf, jobName); + job.setJarByClass(ProtocolStatusStatistics.class); + + String[] inputDirsSpecs = inputDir.split(","); + for (int i = 0; i < inputDirsSpecs.length; i++) { + File completeInputPath = new File(new File(inputDirsSpecs[i]), "current"); + FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); + } + + job.setInputFormatClass(SequenceFileInputFormat.class); + FileOutputFormat.setOutputPath(job, new Path(outputDir)); + job.setOutputFormatClass(TextOutputFormat.class); + + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(LongWritable.class); + + job.setMapperClass(ProtocolStatusStatisticsMapper.class); + job.setReducerClass(ProtocolStatusStatisticsReducer.class); + job.setCombinerClass(ProtocolStatusStatisticsCombiner.class); + job.setNumReduceTasks(numOfReducers); + + try { + job.waitForCompletion(true); + } catch (Exception e) { + throw e; + } + + long end = System.currentTimeMillis(); + LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); + return 0; + } + + static class ProtocolStatusStatisticsMapper extends + Mapper<Text, CrawlDatum, Text, LongWritable> { + + public void map(Text urlText, CrawlDatum datum, Context context) + throws IOException, InterruptedException { + if 
(datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) { + context.write((Text) datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1)); + } else { + context.write(UNFETCHED_TEXT, new LongWritable(1)); + } + } + } + + static class ProtocolStatusStatisticsReducer extends + Reducer<Text, LongWritable, LongWritable, Text> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { + long total = 0; + + for (LongWritable val : values) { + total += val.get(); + } + + context.write(new LongWritable(total), key); + } + } + + public static class ProtocolStatusStatisticsCombiner extends + Reducer<Text, LongWritable, Text, LongWritable> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { + long total = 0; + + for (LongWritable val : values) { + total += val.get(); + } + context.write(key, new LongWritable(total)); + } + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(NutchConfiguration.create(), new ProtocolStatusStatistics(), args); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java new file mode 100644 index 0000000..149269f --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +/** + * A collection of String processing utility methods. + */ +public class StringUtil { + + /** + * Returns a copy of <code>s</code> padded with trailing spaces so that it's + * length is <code>length</code>. Strings already <code>length</code> + * characters long or longer are not altered. + */ + public static String rightPad(String s, int length) { + StringBuffer sb = new StringBuffer(s); + for (int i = length - s.length(); i > 0; i--) + sb.append(" "); + return sb.toString(); + } + + /** + * Returns a copy of <code>s</code> padded with leading spaces so that it's + * length is <code>length</code>. Strings already <code>length</code> + * characters long or longer are not altered. + */ + public static String leftPad(String s, int length) { + StringBuffer sb = new StringBuffer(); + for (int i = length - s.length(); i > 0; i--) + sb.append(" "); + sb.append(s); + return sb.toString(); + } + + private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + + /** + * Convenience call for {@link #toHexString(byte[], String, int)}, where + * <code>sep = null; lineLen = Integer.MAX_VALUE</code>. 
+ * + * @param buf + */ + public static String toHexString(byte[] buf) { + return toHexString(buf, null, Integer.MAX_VALUE); + } + + /** + * Get a text representation of a byte[] as hexadecimal String, where each + * pair of hexadecimal digits corresponds to consecutive bytes in the array. + * + * @param buf + * input data + * @param sep + * separate every pair of hexadecimal digits with this separator, or + * null if no separation is needed. + * @param lineLen + * break the output String into lines containing output for lineLen + * bytes. + */ + public static String toHexString(byte[] buf, String sep, int lineLen) { + if (buf == null) + return null; + if (lineLen <= 0) + lineLen = Integer.MAX_VALUE; + StringBuffer res = new StringBuffer(buf.length * 2); + for (int i = 0; i < buf.length; i++) { + int b = buf[i]; + res.append(HEX_DIGITS[(b >> 4) & 0xf]); + res.append(HEX_DIGITS[b & 0xf]); + if (i > 0 && (i % lineLen) == 0) + res.append('\n'); + else if (sep != null && i < lineLen - 1) + res.append(sep); + } + return res.toString(); + } + + /** + * Convert a String containing consecutive (no inside whitespace) hexadecimal + * digits into a corresponding byte array. If the number of digits is not + * even, a '0' will be appended in the front of the String prior to + * conversion. Leading and trailing whitespace is ignored. 
+ * + * @param text + * input text + * @return converted byte array, or null if unable to convert + */ + public static byte[] fromHexString(String text) { + text = text.trim(); + if (text.length() % 2 != 0) + text = "0" + text; + int resLen = text.length() / 2; + int loNibble, hiNibble; + byte[] res = new byte[resLen]; + for (int i = 0; i < resLen; i++) { + int j = i << 1; + hiNibble = charToNibble(text.charAt(j)); + loNibble = charToNibble(text.charAt(j + 1)); + if (loNibble == -1 || hiNibble == -1) + return null; + res[i] = (byte) (hiNibble << 4 | loNibble); + } + return res; + } + + private static final int charToNibble(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'a' && c <= 'f') { + return 0xa + (c - 'a'); + } else if (c >= 'A' && c <= 'F') { + return 0xA + (c - 'A'); + } else { + return -1; + } + } + + /** + * Checks if a string is empty (ie is null or empty). + */ + public static boolean isEmpty(String str) { + return (str == null) || (str.equals("")); + } + + /** + * Simple character substitution which cleans all � chars from a given String. 
+ */ + public static String cleanField(String value) { + return value.replaceAll("�", ""); + } + + public static void main(String[] args) { + if (args.length != 1) + System.out.println("Usage: StringUtil <encoding name>"); + else + System.out.println(args[0] + " is resolved to " + + EncodingDetector.resolveEncodingAlias(args[0])); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java new file mode 100644 index 0000000..a967c01 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.util.Collection; +import java.util.Iterator; + +/** + * A class for efficiently matching <code>String</code>s against a set of + * suffixes. Zero-length <code>Strings</code> are ignored. 
+ */ +public class SuffixStringMatcher extends TrieStringMatcher { + + /** + * Creates a new <code>PrefixStringMatcher</code> which will match + * <code>String</code>s with any suffix in the supplied array. + */ + public SuffixStringMatcher(String[] suffixes) { + super(); + for (int i = 0; i < suffixes.length; i++) + addPatternBackward(suffixes[i]); + } + + /** + * Creates a new <code>PrefixStringMatcher</code> which will match + * <code>String</code>s with any suffix in the supplied + * <code>Collection</code> + */ + public SuffixStringMatcher(Collection<String> suffixes) { + super(); + Iterator<String> iter = suffixes.iterator(); + while (iter.hasNext()) + addPatternBackward(iter.next()); + } + + /** + * Returns true if the given <code>String</code> is matched by a suffix in the + * trie + */ + public boolean matches(String input) { + TrieNode node = root; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) + return false; + if (node.isTerminal()) + return true; + } + return false; + } + + /** + * Returns the shortest suffix of <code>input<code> that is matched, + * or <code>null<code> if no match exists. + */ + public String shortestMatch(String input) { + TrieNode node = root; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) + return null; + if (node.isTerminal()) + return input.substring(i); + } + return null; + } + + /** + * Returns the longest suffix of <code>input<code> that is matched, + * or <code>null<code> if no match exists. 
+ */ + public String longestMatch(String input) { + TrieNode node = root; + String result = null; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) + break; + if (node.isTerminal()) + result = input.substring(i); + } + return result; + } + + public static final void main(String[] argv) { + SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a", + "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" }); + + String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac", + "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", }; + + for (int i = 0; i < tests.length; i++) { + System.out.println("testing: " + tests[i]); + System.out.println(" matches: " + matcher.matches(tests[i])); + System.out.println(" shortest: " + matcher.shortestMatch(tests[i])); + System.out.println(" longest: " + matcher.longestMatch(tests[i])); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java new file mode 100644 index 0000000..68ded69 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java @@ -0,0 +1,161 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.nutch.util; + +import org.apache.commons.lang.StringUtils; + +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.ByteBuffer; + +public class TableUtil { + + public static final ByteBuffer YES_VAL = ByteBuffer.wrap(new byte[] { 'y' }); + + /** + * Reverses a url's domain. This form is better for storing in hbase. Because + * scans within the same domain are faster. + * <p> + * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes + * "com.foo.bar:8983:http/to/index.html?a=b". + * + * @param url + * url to be reversed + * @return Reversed url + * @throws MalformedURLException + */ + public static String reverseUrl(String urlString) + throws MalformedURLException { + return reverseUrl(new URL(urlString)); + } + + /** + * Reverses a url's domain. This form is better for storing in hbase. Because + * scans within the same domain are faster. + * <p> + * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes + * "com.foo.bar:http:8983/to/index.html?a=b". 
+ * + * @param url + * url to be reversed + * @return Reversed url + */ + public static String reverseUrl(URL url) { + String host = url.getHost(); + String file = url.getFile(); + String protocol = url.getProtocol(); + int port = url.getPort(); + + StringBuilder buf = new StringBuilder(); + + /* reverse host */ + reverseAppendSplits(host, buf); + + /* add protocol */ + buf.append(':'); + buf.append(protocol); + + /* add port if necessary */ + if (port != -1) { + buf.append(':'); + buf.append(port); + } + + /* add path */ + if (file.length() > 0 && '/' != file.charAt(0)) { + buf.append('/'); + } + buf.append(file); + + return buf.toString(); + } + + public static String unreverseUrl(String reversedUrl) { + StringBuilder buf = new StringBuilder(reversedUrl.length() + 2); + + int pathBegin = reversedUrl.indexOf('/'); + if (pathBegin == -1) + pathBegin = reversedUrl.length(); + String sub = reversedUrl.substring(0, pathBegin); + + String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed + // host>, + // <port>, + // <protocol>} + + buf.append(splits[1]); // add protocol + buf.append("://"); + reverseAppendSplits(splits[0], buf); // splits[0] is reversed + // host + if (splits.length == 3) { // has a port + buf.append(':'); + buf.append(splits[2]); + } + buf.append(reversedUrl.substring(pathBegin)); + return buf.toString(); + } + + /** + * Given a reversed url, returns the reversed host E.g + * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" + * + * @param reversedUrl + * Reversed url + * @return Reversed host + */ + public static String getReversedHost(String reversedUrl) { + return reversedUrl.substring(0, reversedUrl.indexOf(':')); + } + + private static void reverseAppendSplits(String string, StringBuilder buf) { + String[] splits = StringUtils.split(string, '.'); + if (splits.length > 0) { + for (int i = splits.length - 1; i > 0; i--) { + buf.append(splits[i]); + buf.append('.'); + } + buf.append(splits[0]); + } else { + 
buf.append(string); + } + } + + public static String reverseHost(String hostName) { + StringBuilder buf = new StringBuilder(); + reverseAppendSplits(hostName, buf); + return buf.toString(); + + } + + public static String unreverseHost(String reversedHostName) { + return reverseHost(reversedHostName); // Reversible + } + + /** + * Convert given Utf8 instance to String and and cleans out any offending "�" + * from the String. + * + * + * @param utf8 + * Utf8 object + * @return string-ifed Utf8 object or null if Utf8 instance is null + */ + public static String toString(CharSequence utf8) { + return (utf8 == null ? null : StringUtil.cleanField(utf8.toString())); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java new file mode 100644 index 0000000..c4af356 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Helpers for rendering durations as human readable strings.
 */
public class TimingUtil {

  /**
   * Calculate the elapsed time between two times specified in milliseconds.
   *
   * @param start
   *          The start of the time period
   * @param end
   *          The end of the time period
   * @return a string of the form "hh:mm:ss" (see {@link #secondsToHMS(long)})
   *         for the whole seconds elapsed, or null if start &gt; end.
   */
  public static String elapsedTime(long start, long end) {
    if (start > end) {
      return null;
    }
    return secondsToHMS((end - start) / 1000);
  }

  /**
   * Show time in seconds as hours, minutes and seconds (hh:mm:ss). The hours
   * field grows beyond two digits when needed.
   *
   * @param seconds
   *          (elapsed) time in seconds
   * @return human readable time string "hh:mm:ss"
   */
  public static String secondsToHMS(long seconds) {
    long hours = TimeUnit.SECONDS.toHours(seconds);
    long minutes = TimeUnit.SECONDS.toMinutes(seconds)
        % TimeUnit.HOURS.toMinutes(1);
    long secs = seconds % TimeUnit.MINUTES.toSeconds(1);
    // Locale.ROOT keeps the digits ASCII regardless of the JVM default
    // locale, so the result is stable in logs and parseable by tools.
    return String.format(java.util.Locale.ROOT, "%02d:%02d:%02d", hours,
        minutes, secs);
  }

  /**
   * Show time in seconds as days, hours, minutes and seconds (d days,
   * hh:mm:ss). When the duration is shorter than one day the "d days, " prefix
   * is omitted.
   *
   * @param seconds
   *          (elapsed) time in seconds
   * @return human readable time string "d days, hh:mm:ss"
   */
  public static String secondsToDaysHMS(long seconds) {
    long days = TimeUnit.SECONDS.toDays(seconds);
    if (days == 0) {
      return secondsToHMS(seconds);
    }
    String hhmmss = secondsToHMS(seconds % TimeUnit.DAYS.toSeconds(1));
    return String.format(java.util.Locale.ROOT, "%d days, %s", days, hhmmss);
  }

}
/**
 * TrieStringMatcher is a base class for simple tree-based string matching.
 * Subclasses decide in which direction patterns are stored and inputs are
 * walked.
 */
public abstract class TrieStringMatcher {
  protected TrieNode root;

  protected TrieStringMatcher() {
    this.root = new TrieNode('\000', false);
  }

  /**
   * Node class for the character tree. The children are held either in a
   * sorted list (while patterns are being added) or in a sorted array (while
   * lookups are being made); each operation switches to the representation
   * it needs.
   */
  protected class TrieNode implements Comparable<TrieNode> {
    protected TrieNode[] children;
    protected LinkedList<TrieNode> childrenList;
    protected char nodeChar;
    protected boolean terminal;

    /**
     * Creates a new TrieNode, which contains the given <code>nodeChar</code>.
     * If <code>isTerminal</code> is <code>true</code>, the new node is a
     * <em>terminal</em> node in the trie.
     */
    TrieNode(char nodeChar, boolean isTerminal) {
      this.nodeChar = nodeChar;
      this.terminal = isTerminal;
      this.childrenList = new LinkedList<TrieNode>();
    }

    /**
     * Returns <code>true</code> if this node is a <em>terminal</em> node in the
     * trie.
     */
    boolean isTerminal() {
      return terminal;
    }

    /**
     * Returns the child node of this node whose node-character is
     * <code>nextChar</code>. If no such node exists, one is added at its
     * sorted position. If <em>isTerminal</em> is <code>true</code>, the node
     * will be a terminal node in the trie.
     */
    TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
      if (childrenList == null) {
        // switch back from the lookup array to the editable list
        childrenList = new LinkedList<TrieNode>();
        childrenList.addAll(Arrays.asList(children));
        children = null;
      }

      ListIterator<TrieNode> cursor = childrenList.listIterator();
      while (cursor.hasNext()) {
        TrieNode candidate = cursor.next();
        if (candidate.nodeChar == nextChar) {
          candidate.terminal |= isTerminal;
          return candidate;
        }
        if (candidate.nodeChar > nextChar) {
          // step back so the new node is inserted in front of candidate
          cursor.previous();
          break;
        }
      }

      TrieNode created = new TrieNode(nextChar, isTerminal);
      cursor.add(created);
      return created;
    }

    /**
     * Returns the child node of this node whose node-character is
     * <code>nextChar</code>. If no such node exists, <code>null</code> is
     * returned.
     */
    TrieNode getChild(char nextChar) {
      if (children == null) {
        // switch from the editable list to a sorted array for binary search
        children = childrenList.toArray(new TrieNode[childrenList.size()]);
        childrenList = null;
        Arrays.sort(children);
      }

      int lo = 0;
      int hi = children.length - 1;
      while (lo <= hi) {
        int mid = (lo + hi) / 2;
        TrieNode probe = children[mid];
        if (probe.nodeChar == nextChar) {
          return probe;
        }
        if (probe.nodeChar < nextChar) {
          lo = mid + 1;
        } else {
          hi = mid - 1;
        }
      }
      return null;
    }

    /** Orders nodes by their node-character. */
    public int compareTo(TrieNode other) {
      if (this.nodeChar == other.nodeChar) {
        return 0;
      }
      return (this.nodeChar < other.nodeChar) ? -1 : 1;
    }
  }

  /**
   * Returns the next {@link TrieNode} visited, given that you are at
   * <code>node</code>, and the next character in the input is the
   * <code>idx</code>'th character of <code>s</code>.
   */
  protected final TrieNode matchChar(TrieNode node, String s, int idx) {
    char next = s.charAt(idx);
    return node.getChild(next);
  }

  /**
   * Adds any necessary nodes to the trie so that the given <code>String</code>
   * can be decoded and the last character is represented by a terminal node.
   * Zero-length <code>Strings</code> are ignored.
   */
  protected final void addPatternForward(String s) {
    if (s.length() == 0) {
      return;
    }
    TrieNode node = root;
    int last = s.length() - 1;
    for (int i = 0; i < last; i++) {
      node = node.getChildAddIfNotPresent(s.charAt(i), false);
    }
    node.getChildAddIfNotPresent(s.charAt(last), true);
  }

  /**
   * Adds any necessary nodes to the trie so that the given <code>String</code>
   * can be decoded <em>in reverse</em> and the first character is represented
   * by a terminal node. Zero-length <code>Strings</code> are ignored.
   */
  protected final void addPatternBackward(String s) {
    if (s.length() == 0) {
      return;
    }
    TrieNode node = root;
    int i = s.length();
    while (--i > 0) {
      node = node.getChildAddIfNotPresent(s.charAt(i), false);
    }
    node.getChildAddIfNotPresent(s.charAt(0), true);
  }

  /**
   * Returns true if the given <code>String</code> is matched by a pattern in
   * the trie
   */
  public abstract boolean matches(String input);

  /**
   * Returns the shortest substring of <code>input</code> that is matched by a
   * pattern in the trie, or <code>null</code> if no match exists.
   */
  public abstract String shortestMatch(String input);

  /**
   * Returns the longest substring of <code>input</code> that is matched by a
   * pattern in the trie, or <code>null</code> if no match exists.
   */
  public abstract String longestMatch(String input);

}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.net.MalformedURLException; +import java.net.*; +import java.util.regex.Pattern; + +import org.apache.nutch.util.domain.DomainSuffix; +import org.apache.nutch.util.domain.DomainSuffixes; + +/** Utility class for URL analysis */ +public class URLUtil { + + /** + * Resolve relative URL-s and fix a java.net.URL error in handling of URLs + * with pure query targets. + * + * @param base + * base url + * @param target + * target url (may be relative) + * @return resolved absolute url. + * @throws MalformedURLException + */ + public static URL resolveURL(URL base, String target) + throws MalformedURLException { + target = target.trim(); + + // handle the case that there is a target that is a pure query, + // for example + // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0 + // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by + // default + // URL constructs the base+target combo as + // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly + // dropping the Search.aspx target + // + // Browsers handle these just fine, they must have an exception similar to + // this + if (target.startsWith("?")) { + return fixPureQueryTargets(base, target); + } + + return new URL(base, target); + } + + /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. 
*/ + static URL fixPureQueryTargets(URL base, String target) + throws MalformedURLException { + if (!target.startsWith("?")) + return new URL(base, target); + + String basePath = base.getPath(); + String baseRightMost = ""; + int baseRightMostIdx = basePath.lastIndexOf("/"); + if (baseRightMostIdx != -1) { + baseRightMost = basePath.substring(baseRightMostIdx + 1); + } + + if (target.startsWith("?")) + target = baseRightMost + target; + + return new URL(base, target); + } + + private static Pattern IP_PATTERN = Pattern + .compile("(\\d{1,3}\\.){3}(\\d{1,3})"); + + /** + * Returns the domain name of the url. The domain name of a url is the + * substring of the url's hostname, w/o subdomain names. As an example <br> + * <code> + * getDomainName(conf, new URL(http://lucene.apache.org/)) + * </code><br> + * will return <br> + * <code> apache.org</code> + * */ + public static String getDomainName(URL url) { + DomainSuffixes tlds = DomainSuffixes.getInstance(); + String host = url.getHost(); + // it seems that java returns hostnames ending with . + if (host.endsWith(".")) + host = host.substring(0, host.length() - 1); + if (IP_PATTERN.matcher(host).matches()) + return host; + + int index = 0; + String candidate = host; + for (; index >= 0;) { + index = candidate.indexOf('.'); + String subCandidate = candidate.substring(index + 1); + if (tlds.isDomainSuffix(subCandidate)) { + return candidate; + } + candidate = subCandidate; + } + return candidate; + } + + /** + * Returns the domain name of the url. The domain name of a url is the + * substring of the url's hostname, w/o subdomain names. As an example <br> + * <code> + * getDomainName(conf, new http://lucene.apache.org/) + * </code><br> + * will return <br> + * <code> apache.org</code> + * + * @throws MalformedURLException + */ + public static String getDomainName(String url) throws MalformedURLException { + return getDomainName(new URL(url)); + } + + /** + * Returns the top level domain name of the url. 
The top level domain name of + * a url is the substring of the url's hostname, w/o subdomain names. As an + * example <br> + * <code> + * getTopLevelDomainName(conf, new http://lucene.apache.org/) + * </code><br> + * will return <br> + * <code> org</code> + * + * @throws MalformedURLException + */ + public static String getTopLevelDomainName(URL url) + throws MalformedURLException { + String suffix = getDomainSuffix(url).toString(); + int idx = suffix.lastIndexOf("."); + if (idx != -1) { + return suffix.substring(idx + 1); + } else { + return suffix; + } + } + + /** + * Returns the top level domain name of the url. The top level domain name of + * a url is the substring of the url's hostname, w/o subdomain names. As an + * example <br> + * <code> + * getTopLevelDomainName(conf, new http://lucene.apache.org/) + * </code><br> + * will return <br> + * <code> org</code> + * + * @throws MalformedURLException + */ + public static String getTopLevelDomainName(String url) + throws MalformedURLException { + return getTopLevelDomainName(new URL(url)); + } + + /** + * Returns whether the given urls have the same domain name. As an example, <br> + * <code> isSameDomain(new URL("http://lucene.apache.org") + * , new URL("http://people.apache.org/")) + * <br> will return true. </code> + * + * @return true if the domain names are equal + */ + public static boolean isSameDomainName(URL url1, URL url2) { + return getDomainName(url1).equalsIgnoreCase(getDomainName(url2)); + } + + /** + * Returns whether the given urls have the same domain name. As an example, <br> + * <code> isSameDomain("http://lucene.apache.org" + * ,"http://people.apache.org/") + * <br> will return true. 
</code> + * + * @return true if the domain names are equal + * @throws MalformedURLException + */ + public static boolean isSameDomainName(String url1, String url2) + throws MalformedURLException { + return isSameDomainName(new URL(url1), new URL(url2)); + } + + /** + * Returns the {@link DomainSuffix} corresponding to the last public part of + * the hostname + */ + public static DomainSuffix getDomainSuffix(URL url) { + DomainSuffixes tlds = DomainSuffixes.getInstance(); + String host = url.getHost(); + if (IP_PATTERN.matcher(host).matches()) + return null; + + int index = 0; + String candidate = host; + for (; index >= 0;) { + index = candidate.indexOf('.'); + String subCandidate = candidate.substring(index + 1); + DomainSuffix d = tlds.get(subCandidate); + if (d != null) { + return d; + } + candidate = subCandidate; + } + return null; + } + + /** + * Returns the {@link DomainSuffix} corresponding to the last public part of + * the hostname + */ + public static DomainSuffix getDomainSuffix(String url) + throws MalformedURLException { + return getDomainSuffix(new URL(url)); + } + + /** Partitions of the hostname of the url by "." */ + public static String[] getHostSegments(URL url) { + String host = url.getHost(); + // return whole hostname, if it is an ipv4 + // TODO : handle ipv6 + if (IP_PATTERN.matcher(host).matches()) + return new String[] { host }; + return host.split("\\."); + } + + /** + * Partitions of the hostname of the url by "." + * + * @throws MalformedURLException + */ + public static String[] getHostSegments(String url) + throws MalformedURLException { + return getHostSegments(new URL(url)); + } + + /** + * <p> + * Given two urls, a src and a destination of a redirect, it returns the + * representative url. + * <p> + * + * <p> + * This method implements an extended version of the algorithm used by the + * Yahoo! 
Slurp crawler described here:<br> + * <a href= + * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How + * does the Yahoo! webcrawler handle redirects?</a> <br> + * <br> + * <ol> + * <li>Choose target url if either url is malformed.</li> + * <li>If different domains the keep the destination whether or not the + * redirect is temp or perm</li> + * <ul> + * <li>a.com -> b.com*</li> + * </ul> + * <li>If the redirect is permanent and the source is root, keep the source.</li> + * <ul> + * <li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li> + * </ul> + * <li>If the redirect is permanent and the source is not root and the + * destination is root, keep the destination</li> + * <ul> + * <li>a.com/xyz/index.html -> a.com*</li> + * </ul> + * <li>If the redirect is permanent and neither the source nor the destination + * is root, then keep the destination</li> + * <ul> + * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> + * </ul> + * <li>If the redirect is temporary and source is root and destination is not + * root, then keep the source</li> + * <ul> + * <li>*a.com -> a.com/xyz/index.html</li> + * </ul> + * <li>If the redirect is temporary and source is not root and destination is + * root, then keep the destination</li> + * <ul> + * <li>a.com/xyz/index.html -> a.com*</li> + * </ul> + * <li>If the redirect is temporary and neither the source or the destination + * is root, then keep the shortest url. First check for the shortest host, and + * if both are equal then check by path. 
Path is first by length then by the + * number of / path separators.</li> + * <ul> + * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> + * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li> + * </ul> + * <li>If the redirect is temporary and both the source and the destination + * are root, then keep the shortest sub-domain</li> + * <ul> + * <li>*www.a.com -> www.news.a.com</li> + * </ul> + * <br> + * While not in this logic there is a further piece of representative url + * logic that occurs during indexing and after scoring. During creation of the + * basic fields before indexing, if a url has a representative url stored we + * check both the url and its representative url (which should never be the + * same) against their linkrank scores and the highest scoring one is kept as + * the url and the lower scoring one is held as the orig url inside of the + * index. + * + * @param src + * The source url. + * @param dst + * The destination url. + * @param temp + * Is the redirect a temporary redirect. + * + * @return String The representative url. 
+ */ + public static String chooseRepr(String src, String dst, boolean temp) { + + // validate both are well formed urls + URL srcUrl; + URL dstUrl; + try { + srcUrl = new URL(src); + dstUrl = new URL(dst); + } catch (MalformedURLException e) { + return dst; + } + + // get the source and destination domain, host, and page + String srcDomain = URLUtil.getDomainName(srcUrl); + String dstDomain = URLUtil.getDomainName(dstUrl); + String srcHost = srcUrl.getHost(); + String dstHost = dstUrl.getHost(); + String srcFile = srcUrl.getFile(); + String dstFile = dstUrl.getFile(); + + // are the source and destination the root path url.com/ or url.com + boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0); + boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0); + + // 1) different domain them keep dest, temp or perm + // a.com -> b.com* + // + // 2) permanent and root, keep src + // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html + // + // 3) permanent and not root and dest root, keep dest + // a.com/xyz/index.html -> a.com* + // + // 4) permanent and neither root keep dest + // a.com/xyz/index.html -> a.com/abc/page.html* + // + // 5) temp and root and dest not root keep src + // *a.com -> a.com/xyz/index.html + // + // 7) temp and not root and dest root keep dest + // a.com/xyz/index.html -> a.com* + // + // 8) temp and neither root, keep shortest, if hosts equal by path else by + // hosts. 
paths are first by length then by number of / separators + // a.com/xyz/index.html -> a.com/abc/page.html* + // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html + // + // 9) temp and both root keep shortest sub domain + // *www.a.com -> www.news.a.com + + // if we are dealing with a redirect from one domain to another keep the + // destination + if (!srcDomain.equals(dstDomain)) { + return dst; + } + + // if it is a permanent redirect + if (!temp) { + + // if source is root return source, otherwise destination + if (srcRoot) { + return src; + } else { + return dst; + } + } else { // temporary redirect + + // source root and destination not root + if (srcRoot && !destRoot) { + return src; + } else if (!srcRoot && destRoot) { // destination root and source not + return dst; + } else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) { + + // source and destination hosts are the same, check paths, host length + int numSrcPaths = srcFile.split("/").length; + int numDstPaths = dstFile.split("/").length; + if (numSrcPaths != numDstPaths) { + return (numDstPaths < numSrcPaths ? dst : src); + } else { + int srcPathLength = srcFile.length(); + int dstPathLength = dstFile.length(); + return (dstPathLength < srcPathLength ? dst : src); + } + } else { + + // different host names and both root take the shortest + int numSrcSubs = srcHost.split("\\.").length; + int numDstSubs = dstHost.split("\\.").length; + return (numDstSubs < numSrcSubs ? dst : src); + } + } + } + + /** + * Returns the lowercased hostname for the url or null if the url is not well + * formed. + * + * @param url + * The url to check. + * @return String The hostname for the url. + */ + public static String getHost(String url) { + try { + return new URL(url).getHost().toLowerCase(); + } catch (MalformedURLException e) { + return null; + } + } + + /** + * Returns the page for the url. The page consists of the protocol, host, and + * path, but does not include the query string. 
The host is lowercased but the + * path is not. + * + * @param url + * The url to check. + * @return String The page for the url. + */ + public static String getPage(String url) { + try { + // get the full url, and replace the query string with and empty string + url = url.toLowerCase(); + String queryStr = new URL(url).getQuery(); + return (queryStr != null) ? url.replace("?" + queryStr, "") : url; + } catch (MalformedURLException e) { + return null; + } + } + + public static String getProtocol(String url) { + try { + return getProtocol(new URL(url)); + } catch (Exception e) { + return null; + } + } + + public static String getProtocol(URL url) { + return url.getProtocol(); + } + + public static String toASCII(String url) { + try { + URL u = new URL(url); + String host = u.getHost(); + if (host == null || host.isEmpty()) { + // no host name => no punycoded domain name + // also do not add additional slashes for file: URLs (NUTCH-1880) + return url; + } + URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host), + u.getPort(), u.getPath(), u.getQuery(), u.getRef()); + + return p.toString(); + } catch (Exception e) { + return null; + } + } + + public static String toUNICODE(String url) { + try { + URL u = new URL(url); + String host = u.getHost(); + if (host == null || host.isEmpty()) { + // no host name => no punycoded domain name + // also do not add additional slashes for file: URLs (NUTCH-1880) + return url; + } + StringBuilder sb = new StringBuilder(); + sb.append(u.getProtocol()); + sb.append("://"); + if (u.getUserInfo() != null) { + sb.append(u.getUserInfo()); + sb.append('@'); + } + sb.append(IDN.toUnicode(host)); + if (u.getPort() != -1) { + sb.append(':'); + sb.append(u.getPort()); + } + sb.append(u.getFile()); // includes query + if (u.getRef() != null) { + sb.append('#'); + sb.append(u.getRef()); + } + + return sb.toString(); + } catch (Exception e) { + return null; + } + } + + /** For testing */ + public static void main(String[] args) { + + 
if (args.length != 1) { + System.err.println("Usage : URLUtil <url>"); + return; + } + + String url = args[0]; + try { + System.out.println(URLUtil.getDomainName(new URL(url))); + } catch (MalformedURLException ex) { + ex.printStackTrace(); + } + } +}
