http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java new file mode 100644 index 0000000..13064eb --- /dev/null +++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.collection; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.net.URLFilter; +import org.apache.xerces.util.DOMUtil; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +/** + * SubCollection represents a subset of index, you can define url patterns that + * will indicate that particular page (url) is part of SubCollection. 
+ */ +public class Subcollection extends Configured implements URLFilter { + + public static final String TAG_COLLECTIONS = "subcollections"; + public static final String TAG_COLLECTION = "subcollection"; + public static final String TAG_WHITELIST = "whitelist"; + public static final String TAG_BLACKLIST = "blacklist"; + public static final String TAG_NAME = "name"; + public static final String TAG_KEY = "key"; + public static final String TAG_ID = "id"; + + List<String> blackList = new ArrayList<String>(); + List<String> whiteList = new ArrayList<String>(); + + /** + * SubCollection identifier + */ + String id; + + /** + * SubCollection key + */ + String key; + + /** + * SubCollection name + */ + String name; + + /** + * SubCollection whitelist as String + */ + String wlString; + + /** + * SubCollection blacklist as String + */ + String blString; + + /** + * public Constructor + * + * @param id + * id of SubCollection + * @param name + * name of SubCollection + */ + public Subcollection(String id, String name, Configuration conf) { + this(id, name, null, conf); + } + + /** + * public Constructor + * + * @param id + * id of SubCollection + * @param name + * name of SubCollection + */ + public Subcollection(String id, String name, String key, Configuration conf) { + this(conf); + this.id = id; + this.key = key; + this.name = name; + } + + public Subcollection(Configuration conf) { + super(conf); + } + + /** + * @return Returns the name + */ + public String getName() { + return name; + } + + /** + * @return Returns the key + */ + public String getKey() { + return key; + } + + /** + * @return Returns the id + */ + public String getId() { + return id; + } + + /** + * Returns whitelist + * + * @return Whitelist entries + */ + public List<String> getWhiteList() { + return whiteList; + } + + /** + * Returns whitelist String + * + * @return Whitelist String + */ + public String getWhiteListString() { + return wlString; + } + + /** + * Returns blacklist String + * + * 
@return Blacklist String + */ + public String getBlackListString() { + return blString; + } + + /** + * @param whiteList + * The whiteList to set. + */ + public void setWhiteList(ArrayList<String> whiteList) { + this.whiteList = whiteList; + } + + /** + * Simple "indexOf" currentFilter for matching patterns. + * + * <pre> + * rules for evaluation are as follows: + * 1. if pattern matches in blacklist then url is rejected + * 2. if pattern matches in whitelist then url is allowed + * 3. url is rejected + * </pre> + * + * @see org.apache.nutch.net.URLFilter#filter(java.lang.String) + */ + public String filter(String urlString) { + // first the blacklist + Iterator<String> i = blackList.iterator(); + while (i.hasNext()) { + String row = (String) i.next(); + if (urlString.contains(row)) + return null; + } + + // then whitelist + i = whiteList.iterator(); + while (i.hasNext()) { + String row = (String) i.next(); + if (urlString.contains(row)) + return urlString; + } + return null; + } + + /** + * Initialize Subcollection from dom element + * + * @param collection + */ + public void initialize(Element collection) { + this.id = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_ID).item(0)).trim(); + this.name = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_NAME).item(0)).trim(); + this.wlString = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim(); + + parseList(this.whiteList, wlString); + + // Check if there's a blacklist we need to parse + NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST); + if (nodeList.getLength() > 0) { + this.blString = DOMUtil.getChildText(nodeList.item(0)).trim(); + parseList(this.blackList, blString); + } + + // Check if there's a key element or set default name + nodeList = collection.getElementsByTagName(TAG_KEY); + if (nodeList.getLength() == 1) { + this.key = DOMUtil.getChildText(nodeList.item(0)).trim(); + } + } + + /** + * Create a list of patterns from chunk of 
text, patterns are separated with + * newline + * + * @param list + * @param text + */ + protected void parseList(List<String> list, String text) { + list.clear(); + + StringTokenizer st = new StringTokenizer(text, "\n\r"); + + while (st.hasMoreElements()) { + String line = (String) st.nextElement(); + list.add(line.trim()); + } + } + + /** + * Set contents of blacklist from String + * + * @param list + * the blacklist contents + */ + public void setBlackList(String list) { + this.blString = list; + parseList(blackList, list); + } + + /** + * Set contents of whitelist from String + * + * @param list + * the whitelist contents + */ + public void setWhiteList(String list) { + this.wlString = list; + parseList(whiteList, list); + } +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html new file mode 100644 index 0000000..be08d1c --- /dev/null +++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html @@ -0,0 +1,36 @@ +<html> +<body> +<p> +Subcollection is a subset of an index. Subcollections are defined +by URL patterns in the form of a whitelist and a blacklist. To get a page into +a subcollection it must match the whitelist and not the blacklist. +</p> +<p> +Subcollection definitions are read from the file subcollections.xml +and the format is as follows (imagine here that you are crawling all +the virtualhosts from apache.org and you want to tag pages with +url patterns "http://lucene.apache.org/nutch" and "http://wiki.apache.org/nutch/" +to be part of subcollection "nutch"; this allows you to later search +specifically from this subcollection; patterns are listed one per line) +</p> +<p/> +<p/> +<pre> +&lt;?xml version="1.0" encoding="UTF-8"?&gt; +&lt;subcollections&gt; + &lt;subcollection&gt; + &lt;name&gt;nutch&lt;/name&gt; + &lt;id&gt;lucene&lt;/id&gt; + &lt;whitelist&gt;http://lucene.apache.org/nutch + http://wiki.apache.org/nutch/&lt;/whitelist&gt; + &lt;blacklist /&gt; + &lt;/subcollection&gt; +&lt;/subcollections&gt; +</pre> +</p> +<p>Despite this configuration you can still crawl any urls +as long as they pass through your global url filters. 
(note that +you must also seed your urls in normal nutch way) +</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java new file mode 100644 index 0000000..2946d9e --- /dev/null +++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.subcollection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; + +import org.apache.nutch.collection.CollectionManager; +import org.apache.nutch.collection.Subcollection; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +public class SubcollectionIndexingFilter extends Configured implements + IndexingFilter { + + private Configuration conf; + + public SubcollectionIndexingFilter() { + super(NutchConfiguration.create()); + } + + public SubcollectionIndexingFilter(Configuration conf) { + super(conf); + } + + /** + * @param Configuration + * conf + */ + public void setConf(Configuration conf) { + this.conf = conf; + fieldName = conf.get("subcollection.default.fieldname", "subcollection"); + } + + /** + * @return Configuration + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Doc field name + */ + public static String fieldName = "subcollection"; + + /** + * Logger + */ + public static final Logger LOG = LoggerFactory + .getLogger(SubcollectionIndexingFilter.class); + + /** + * "Mark" document to be a part of subcollection + * + * @param doc + * @param url + */ + private void addSubCollectionField(NutchDocument doc, String url) { + for (Subcollection coll : CollectionManager.getCollectionManager(getConf()) + .getSubCollections(url)) { + if (coll.getKey() == null) { + doc.add(fieldName, coll.getName()); + } else { + doc.add(coll.getKey(), coll.getName()); + } + } + } + + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws 
IndexingException { + String sUrl = url.toString(); + addSubCollectionField(doc, sUrl); + return doc; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java new file mode 100644 index 0000000..1c6ba72 --- /dev/null +++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Indexing filter to assign documents to subcollections. + * The field "subcollection" is added and filled with a collection name + * defined in a configuration file and selected by pattern, see + * {@link org.apache.nutch.collection}. 
+ */ +package org.apache.nutch.indexer.subcollection; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java new file mode 100644 index 0000000..a2d2772 --- /dev/null +++ b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.collection; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.Collection; + +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestSubcollection { + + /** + * Test filtering logic + * + * @throws Exception + */ + @Test + public void testFilter() throws Exception { + Subcollection sc = new Subcollection(NutchConfiguration.create()); + sc.setWhiteList("www.nutch.org\nwww.apache.org"); + sc.setBlackList("jpg\nwww.apache.org/zecret/"); + + // matches whitelist + Assert.assertEquals("http://www.apache.org/index.html", + sc.filter("http://www.apache.org/index.html")); + + // matches blacklist + Assert.assertEquals(null, + sc.filter("http://www.apache.org/zecret/index.html")); + Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); + + // no match + Assert.assertEquals(null, sc.filter("http://www.google.com/")); + } + + @Test + public void testInput() { + StringBuffer xml = new StringBuffer(); + xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + xml.append("<!-- just a comment -->"); + xml.append("<subcollections>"); + xml.append("<subcollection>"); + xml.append("<name>nutch collection</name>"); + xml.append("<id>nutch</id>"); + xml.append("<whitelist>"); + xml.append("http://lucene.apache.org/nutch/\n"); + xml.append("http://wiki.apache.org/nutch/\n"); + xml.append("</whitelist>"); + xml.append("<blacklist>"); + xml.append("http://www.xxx.yyy\n"); + xml.append("</blacklist>"); + xml.append("</subcollection>"); + xml.append("</subcollections>"); + + InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); + + CollectionManager cm = new CollectionManager(); + cm.parse(is); + + Collection<?> c = cm.getAll(); + + // test that size matches + Assert.assertEquals(1, c.size()); + + Subcollection collection = (Subcollection) c.toArray()[0]; + + // test collection id + Assert.assertEquals("nutch", 
collection.getId()); + + // test collection name + Assert.assertEquals("nutch collection", collection.getName()); + + // test whitelist + Assert.assertEquals(2, collection.whiteList.size()); + + String wlUrl = (String) collection.whiteList.get(0); + Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl); + + wlUrl = (String) collection.whiteList.get(1); + Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl); + + // matches whitelist + Assert.assertEquals("http://lucene.apache.org/nutch/", + collection.filter("http://lucene.apache.org/nutch/")); + + // test blacklist + Assert.assertEquals(1, collection.blackList.size()); + + String blUrl = (String) collection.blackList.get(0); + Assert.assertEquals("http://www.xxx.yyy", blUrl); + + // no match + Assert.assertEquals(null, collection.filter("http://www.google.com/")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/build.xml b/nutch-plugins/tld/build.xml new file mode 100644 index 0000000..f46c8e6 --- /dev/null +++ b/nutch-plugins/tld/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="tld" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/ivy.xml b/nutch-plugins/tld/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/tld/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/plugin.xml b/nutch-plugins/tld/plugin.xml new file mode 100644 index 0000000..712a34a --- /dev/null +++ b/nutch-plugins/tld/plugin.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="tld" + name="Top Level Domain Plugin" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="tld.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.tld" + name="Top Level Domain Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="TLDIndexingFilter" + class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/> + </extension> + + <extension id="org.apache.nutch.scoring.tld" + name="Top Level Domain Scoring Filter" + point="org.apache.nutch.scoring.ScoringFilter"> + + <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter" + class="org.apache.nutch.scoring.tld.TLDScoringFilter" /> + </extension> + + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/pom.xml b/nutch-plugins/tld/pom.xml new file mode 100644 index 0000000..95039bd --- /dev/null +++ b/nutch-plugins/tld/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>tld</artifactId> + <packaging>jar</packaging> + + <name>tld</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java new file mode 100644 index 0000000..cd7e194 --- /dev/null +++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.tld; + +import java.net.URL; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.util.domain.DomainSuffix; + +/** + * Adds the Top level domain extensions to the index + * + * @author Enis Soztutar <[email protected]> + */ +public class TLDIndexingFilter implements IndexingFilter { + public static final Logger LOG = LoggerFactory + .getLogger(TLDIndexingFilter.class); + + private Configuration conf; + + public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + try { + URL url = new URL(urlText.toString()); + DomainSuffix d = URLUtil.getDomainSuffix(url); + + doc.add("tld", d.getDomain()); + + } catch (Exception ex) { + LOG.warn(ex.toString()); + } + + return doc; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html new file mode 100644 index 0000000..75841d9 --- /dev/null +++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html @@ -0,0 +1,5 @@ +<html> 
+<body> +<p>Top Level Domain Indexing plugin.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java new file mode 100644 index 0000000..b7f4963 --- /dev/null +++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.scoring.tld; + +import java.util.List; +import java.util.Collection; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.util.domain.DomainSuffix; +import org.apache.nutch.util.domain.DomainSuffixes; + +/** + * Scoring filter to boost tlds. + * + * @author Enis Soztutar <[email protected]> + */ +public class TLDScoringFilter implements ScoringFilter { + + private Configuration conf; + private DomainSuffixes tldEntries; + + public TLDScoringFilter() { + tldEntries = DomainSuffixes.getInstance(); + } + + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + + NutchField tlds = doc.getField("tld"); + float boost = 1.0f; + + if (tlds != null) { + for (Object tld : tlds.getValues()) { + DomainSuffix entry = tldEntries.get(tld.toString()); + if (entry != null) + boost *= entry.getBoost(); + } + } + return initScore * boost; + } + + public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, + ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, + int validCount) throws ScoringFilterException { + return adjust; + } + + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return initSort; + } + + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + } + + public void injectedScore(Text url, CrawlDatum datum) + 
throws ScoringFilterException { + } + + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException { + } + + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) + throws ScoringFilterException { + } + + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + return adjust; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html new file mode 100644 index 0000000..d05e4b8 --- /dev/null +++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>Top Level Domain Scoring plugin.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/build.xml b/nutch-plugins/urlfilter-automaton/build.xml new file mode 100644 index 0000000..78557fc --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/build.xml @@ -0,0 +1,51 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-automaton" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + </path> + + <!-- Compile test classes for dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.rules, **/*.urls"/> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/ivy.xml b/nutch-plugins/urlfilter-automaton/ivy.xml new file mode 100644 index 
0000000..7c1968f --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" /> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/plugin.xml b/nutch-plugins/urlfilter-automaton/plugin.xml new file mode 100644 index 0000000..d0cc1ef --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache 
Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-automaton" + name="Automaton URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-automaton.jar"> + <export name="*"/> + </library> + <library name="automaton-1.11-8.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-regex-filter"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.automaton" + name="Nutch Automaton URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="AutomatonURLFilter" + class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/pom.xml b/nutch-plugins/urlfilter-automaton/pom.xml new file mode 100644 index 0000000..898944e --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/pom.xml @@ -0,0 +1,58 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. 
See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-automaton</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-automaton</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>dk.brics.automaton</groupId> + <artifactId>automaton</artifactId> + <version>1.11-8</version> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-regex-filter</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-regex-filter</artifactId> + <version>${project.parent.version}</version> + <scope>test</scope> + <type>test-jar</type> + </dependency> + + </dependencies> + +</project> 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java new file mode 100644 index 0000000..ae4896d --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.urlfilter.automaton; + +// JDK imports +import java.io.Reader; +import java.io.IOException; +import java.io.StringReader; +import java.util.regex.PatternSyntaxException; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Automaton imports +import dk.brics.automaton.RegExp; +import dk.brics.automaton.RunAutomaton; +import org.apache.nutch.net.*; +import org.apache.nutch.urlfilter.api.RegexRule; +import org.apache.nutch.urlfilter.api.RegexURLFilterBase; + +/** + * RegexURLFilterBase implementation based on the <a + * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State + * Automata for Java<sup>TM</sup>. + * + * @author Jérôme Charron + * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> + */ +public class AutomatonURLFilter extends RegexURLFilterBase { + /** Configuration key naming the rules resource opened by getRulesReader. */ public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file"; + /** Configuration key whose value, when set, is used directly as the rules text. */ public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules"; + + /** Creates a filter through the no-argument superclass constructor. */ public AutomatonURLFilter() { + super(); + } + + /** Creates a filter by passing filename to the superclass constructor. */ public AutomatonURLFilter(String filename) throws IOException, + PatternSyntaxException { + super(filename); + } + + /** Package-private: creates a filter by passing reader to the superclass constructor (used by the unit test). */ AutomatonURLFilter(Reader reader) throws IOException, + IllegalArgumentException { + super(reader); + } + + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + + /** + * Rules specified as a config property will override rules specified as a + * config file. + * A non-null urlfilter.automaton.rules value is wrapped in a StringReader; + * otherwise the resource named by urlfilter.automaton.file is opened through + * the Configuration. 
+ */ + protected Reader getRulesReader(Configuration conf) throws IOException { + String stringRules = conf.get(URLFILTER_AUTOMATON_RULES); + if (stringRules != null) { + return new StringReader(stringRules); + } + /* may be null when the property is unset; it is passed on unchanged */ String fileRules = conf.get(URLFILTER_AUTOMATON_FILE); + return conf.getConfResourceAsReader(fileRules); + } + + // Inherited Javadoc + protected RegexRule createRule(boolean sign, String regex) { + return new Rule(sign, regex); + } + + /** Same as the two-argument form, additionally handing hostOrDomain to the Rule. */ protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { + return new Rule(sign, regex, hostOrDomain); + } + + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + + /** Command-line entry point: hands a new AutomatonURLFilter and args to the two-argument main, which is not defined in this class. */ public static void main(String args[]) throws IOException { + main(new AutomatonURLFilter(), args); + } + + /** RegexRule whose match is decided by a RunAutomaton compiled from the regex with the RegExp.ALL syntax flags. */ private class Rule extends RegexRule { + + /* compiled once per rule in the constructors below */ private RunAutomaton automaton; + + Rule(boolean sign, String regex) { + super(sign, regex); + automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); + } + + Rule(boolean sign, String regex, String hostOrDomain) { + super(sign, regex, hostOrDomain); + automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); + } + + /** Returns the result of running the automaton on url. NOTE(review): presumably run() accepts only when the whole string matches, confirm against the dk.brics.automaton documentation. */ protected boolean match(String url) { + return automaton.run(url); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html new file mode 100644 index 0000000..42533f7 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html @@ -0,0 +1,9 @@ +<html> +<body> +<p> +URL filter plugin based on +<a 
href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State +Automata for Java<sup>TM</sup>. +</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java new file mode 100644 index 0000000..a70a6b6 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.automaton; + +// JDK imports +import java.io.IOException; +import java.io.Reader; + +import org.apache.nutch.net.*; +// Nutch imports +import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit based test of class <code>AutomatonURLFilter</code>. 
+ * + * @author Jérôme Charron + */ +public class TestAutomatonURLFilter extends RegexURLFilterBaseTest { + + /** Builds the filter under test from the given rules reader; an IOException fails the test through Assert.fail. */ protected URLFilter getURLFilter(Reader rules) { + try { + return new AutomatonURLFilter(rules); + } catch (IOException e) { + Assert.fail(e.toString()); + /* only reached if Assert.fail returns; needed to satisfy the compiler */ return null; + } + } + + /** Calls test(...) for the WholeWebCrawling and IntranetCrawling samples, then bench(...) for the Benchmarks sample with 50, 100, 200, 400 and 800 as the first argument. */ @Test + public void test() { + test("WholeWebCrawling"); + test("IntranetCrawling"); + bench(50, "Benchmarks"); + bench(100, "Benchmarks"); + bench(200, "Benchmarks"); + bench(400, "Benchmarks"); + bench(800, "Benchmarks"); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules new file mode 100644 index 0000000..a2f6da0 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) + +# skip URLs containing certain characters as probable queries, etc. 
+-.*[?*!@=].* + +# skip .fr .org and .net domains +-.*//.*\.fr/.* +-.*//.*\.org/.* +-.*//.*\.net/.* + +# accept everything else ++.* http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls new file mode 100644 index 0000000..40bf4ee --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http://johnny.ihackstuff.com/index.php?module=prodreviews +-http://www.spurl.net/ ++http://www.dropload.com/ ++http://vivisimo.com/ ++http://www.marumushi.com/apps/newsmap/newsmap.cfm ++http://www.ixquick.com/ +-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html ++http://www.mail-archive.com/ ++http://www.spymac.com/ +-http://browsers.evolt.org/ +-http://www.oswd.org/ ++http://www.stayinvisible.com/index.pl ++http://java.sun.com/j2se/1.4.2/docs/api/index.html ++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx ++http://www.bloglines.com/ +-http://www.fckeditor.net/ ++http://search.msn.com/ +-http://www.grub.org/ ++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html +-http://www.mnot.net/cache_docs/ 
+-http://www.furl.net/ ++http://www.blogpulse.com/ ++http://www.googlefight.com/ ++http://www.rokulabs.com/ +-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php +-http://www.batbox.org/wrt54g-linux.html +-http://en.wikipedia.org/wiki/%s ++http://www.sipcenter.com/ ++http://www.merriampark.com/ld.htm ++http://anon.inf.tu-dresden.de/index_en.html ++http://www.pluck.com/ ++http://www.tiddlywiki.com/ ++http://www.jux2.com/ ++http://clusty.com/ +-http://findability.org/ ++http://www.searchengineshowdown.com/ ++http://www.nhacks.com/email/index.php ++http://www.koders.com/ ++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf ++http://www.gmailwiki.com/index.php/Main_Page ++http://www.tadalist.com/ ++http://www.net2ftp.com/ ++http://www.streamload.com/ ++http://www.lucazappa.com/brilliantMaker/buttonImage.php ++http://www.hybernaut.com/bdv/delicious-import.html ++http://www.gtmcknight.com/buttons/ ++http://amb.vis.ne.jp/mozilla/scrapbook/ ++http://g-metrics.com/index.php +-http://tor.eff.org/ ++http://www.search-this.com/search_engine_decoder.asp ++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html ++http://www.adaptivepath.com/publications/essays/archives/000385.php +-http://isnoop.net/gmail/ +-http://openweb.eu.org/ ++http://www.mistergooddeal.com/ ++http://javatoolbox.com/ +-http://www.freenews.fr/ ++http://www.wikiwax.com/ +-http://today.java.net/pub/a/today/2005/04/21/farm.html ++http://users.skynet.be/J.Beever/pave.htm ++http://www.lundi8h.com/ ++http://www.snap.com/ ++http://www.goosee.com/puppy/index.shtml +-http://www.softwarefreedom.org/index.html +-http://y.20q.net/ ++http://www.bitty.com/ ++http://www.lafraise.com/ +-http://www.liquidinformation.org/ ++http://www.searchtools.com/ ++http://www.martinfowler.com/articles/injection.html ++http://pdos.csail.mit.edu/scigen/ +-http://developer.yahoo.net/blog/ ++http://blogger-templates.blogspot.com/ ++http://phpadsnew.com/two/ ++http://www.langreiter.com/exec/yahoo-vs-google.html 
+-http://www.dataparksearch.org/ +-http://www.yubnub.org/ +-http://www.fing.org/ +-http://www.swish-e.org/ +-http://www.openajax.net/wordpress/ ++http://crypto.stanford.edu/PwdHash/ ++http://www.html-kit.com/favicon/ +-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 ++http://www.durhamtownship.com/ ++http://jiwire.com/ ++http://www.insilmaril.de/vym/ +-http://www.spreadshirt.net/ ++http://www.goffice.com/ ++http://www.writely.com/ ++http://www.milindparikh.com/ ++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html ++http://www.wikyblog.com/Map/Guest/Home +-http://www.kottke.org/05/08/googleos-webos ++http://www.rollyo.com/ ++http://www.meebo.com/ ++http://www.factbites.com/ ++http://www.placeopedia.com/ ++http://swoogle.umbc.edu/ ++http://www.viaduc.com/ +-http://demo.wikiwyg.net/wikiwyg/demo/standalone/ ++http://podcasts.yahoo.com/ +-http://beaglewiki.org/Main_Page ++http://yq.search.yahoo.com/ +-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 ++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html ++http://socialight.com/ ++http://www.lexxe.com/ ++http://www.xom.nu/ ++http://www.turboprint.de/ ++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 ++http://www.wi-fiplanet.com/tutorials/article.php/3562391 ++http://particletree.com/features/10-tips-to-a-better-form/ ++http://www.songbirdnest.com/ +-http://www.w3.org/Talks/Tools/Slidy/ +-http://www.compassframework.org/display/SITE/Home ++http://motrech.blogspot.com/ ++http://www.moteurzine.com/ ++http://www.mex-search.com/ +-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french ++http://www.goshme.com/ ++http://rialto.application-servers.com/ ++http://www.multe-pass.com/ ++http://www.tailrank.com/ ++http://www.vandertramp.com/INTERNETDOWN/ ++http://www.letterjames.de/index.html ++http://code.google.com/index.html ++http://www.kritx.com/ ++http://performancing.com/firefox ++http://www.mywebsearch.com/ 
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 ++http://www.lukew.com/resources/articles/blogs2.asp +-http://www.hyperwords.net/ ++http://ajax.parish.ath.cx/translator/ ++http://www.maplandia.com/ +-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages ++http://onefeed.com/index.php ++http://www.file-swap.com/ +-http://opennlp.org/ ++http://mindprod.com/jgloss/encoding.html ++http://code.google.com/webstats/index.html ++http://www.freeweb-hosting.com/google_pagerank_pr_checker/ +-http://www.framakey.org/ +-http://microformats.org/wiki/hreview +-http://www.ashesandsnow.org/index2.html +-http://uima-framework.sourceforge.net/ ++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html +-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 ++http://fr.techcrunch.com/ +-http://developer.yahoo.net/yui/ ++http://www.fredrikodman.com/ ++http://www.mpirical.com/companion/mpirical_companion.html ++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html +-http://k9copy.free.fr/ +-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 +-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design +-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 ++http://blogokat.canalblog.com/archives/2005/11/02/882454.html ++http://robur.slu.se/jensl/xmlclitools/ +-http://www.internetactu.net/?p=6291 +-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 ++http://www.memodata.com/2004/fr/alexandria/ +-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave ++http://www.randomerror.com/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ +-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 +-http://interstices.info/display.jsp?id=c_15918 ++http://www.tech-invite.com/ ++http://www.croczilla.com/zap +-http://www.libervis.com/modules/wordpress/?p=13 
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ +-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm ++http://www.influo.com/ ++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html +-http://www.addnb.org/fr/docs/webinvisible.htm +-http://manhack.net/ +-http://www.jibaku.net/ ++http://www.pipologie.com/ ++http://christophenoel.blogspot.com/ +-http://www.seekport.fr/seekbot/ ++http://beta.exalead.com/ +-http://www.boolgum.fr/index.html ++http://www.kesako.canalblog.com/ ++http://loran.blogspot.com/ ++http://outils-recherche.blogspot.com/ ++http://www.art-dept.com/artists/giacobbe/ ++http://www.meggould.netfirms.com/site_seeingIII.htm ++http://www.freedpi.com/ ++http://www.frenchfred.com/ ++http://www.photoways.com/ +-http://freco.free.fr/index.htm +-http://triturages.free.fr/index.htm +-http://www.qsos.org/ ++http://www.alvis.info/alvis/ ++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ +-http://www.shinux.org/ ++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml ++http://www.kurobox.com/online/tiki-index.php +-http://news.gmane.org/gmane.comp.misc.linkstation.linux ++http://www.imsbook.com/SIP-IMS-Standards-List.html +-http://incubator.apache.org/directory/subprojects/snickers/ +-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html +-http://sourceforge.net/projects/cryptix-asn1/ +-http://sourceforge.net/projects/basn/ +-http://asn1.elibel.tm.fr/fr/index.htm +-http://sourceforge.net/projects/a2j/ ++http://www.degrouptest.com/ ++http://interstices.info/ ++http://louvre-boite.viabloga.com/news/18.shtml +-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html ++http://poiplace.oabsoftware.nl/ +-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 +-http://www.yoono.com/favorites.jsp?user-id=lquerel +-http://www.librecours.org/cgi-bin/main 
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 +-http://limo.sourceforge.net/ ++http://www-scf.usc.edu/%7Emattmann/ ++http://spaces.msn.com/members/famillezen/ +-http://photos.joune.org/ +-http://www.canon.fr/paperart/ ++http://flash.eastweb.ru/files/20051024092150.swf ++http://www.xsltwiki.com/index.php/Main_Page ++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ +-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 ++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html +-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ ++http://www.aeliosfinance.com/ ++http://www.capital-it.com/ +-http://www.tradedoubler.fr/pan/public/solutions/publisher +-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm ++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ ++http://wanabo.com/ +-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 +-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam ++http://aeliosfinance.com/ ++http://www.centreincubation.com/ ++http://www.franceincubation.com/ +-http://www.oseo.fr/ ++http://www.i18nfaq.com/chardet.html +-http://cpdetector.sourceforge.net/ ++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles ++http://chezlorry.ca/Accueil.htm ++http://cetnia.blogs.com/d_lires/ +-http://www.directwine.fr/ ++http://www.new-phenix.com/ +-http://upnp.sourceforge.net/ +-http://www.pixmania.fr/ +-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 ++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ ++http://www.stepnewz.com/sn/default.asp ++http://opquast.com/ +-http://www.freeplayer.org/ +-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie +-http://atomcomputer.free.fr/fbox/ 
+-http://www.internetactu.net/index.php?p=6100 +-http://mammouthland.free.fr/cours/css/genecss.php +-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 ++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html +-http://xml.apache.org/xalan-j/extensions.html ++http://developers.sun.com/foryourbusiness/jcc/ ++http://blogs.sun.com/roller/page/roumen/Weblog +-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 +-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 ++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ ++http://odur.let.rug.nl/%7Evannoord/ +-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html +-http://artist.inist.fr/ ++http://www.elra.info/ +-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO ++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability ++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval ++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ ++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ ++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ ++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ ++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html +-http://www.lexique.org/ ++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ ++http://www.streamium.com/products/mx6000i/ +-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr +-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 
++http://www.tversity.com/ +-http://www.aspseek.org/index.php \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules new file mode 100644 index 0000000..8966183 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules @@ -0,0 +1,24 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png) + +# skip URLs containing certain characters as probable queries, etc. 
+-.*[?*!@=].* + +# accept hosts in MY.DOMAIN.NAME ++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.* + +# skip everything else +-.* http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls new file mode 100644 index 0000000..b1ad9b7 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls @@ -0,0 +1,8 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] +-news://any.news.server/comp.lang.java +-whois:/nutch.org ++http://MY.DOMAIN.NAME/ ++http://MY.DOMAIN.NAME/nutch ++http://www.MY.DOMAIN.NAME/ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules new file mode 100644 index 0000000..dfae8b0 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules @@ -0,0 +1,19 @@ +# The default url filter. +# Better for whole-internet crawling. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file: ftp: and mailto: urls +-(file|ftp|mailto):.* + +# skip image and other suffixes we can't yet parse +-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe) + +# skip URLs containing certain characters as probable queries, etc. 
+-.*[?*!@=].* + +# accept anything else ++.* http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls new file mode 100644 index 0000000..d3b1bf3 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls @@ -0,0 +1,11 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[email protected] ++news://any.news.server/comp.lang.java ++whois:/nutch.org +-http://www.nutch.org/nutch.gif +-http://www.nutch.org/nutch.eps +-http://www.nutch.org/nutch?q=nutch ++http://www.nutch.org/ ++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ ++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/build.xml b/nutch-plugins/urlfilter-domain/build.xml new file mode 100644 index 0000000..4af55ac --- /dev/null +++ b/nutch-plugins/urlfilter-domain/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-domain" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/ivy.xml b/nutch-plugins/urlfilter-domain/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-domain/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/plugin.xml b/nutch-plugins/urlfilter-domain/plugin.xml new file mode 100644 index 0000000..1452d58 --- /dev/null +++ b/nutch-plugins/urlfilter-domain/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlfilter-domain" + name="Domain URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-domain.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.domain" + name="Nutch Domain URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="DomainURLFilter" + class="org.apache.nutch.urlfilter.domain.DomainURLFilter"> + <parameter name="file" value="domain-urlfilter.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/pom.xml b/nutch-plugins/urlfilter-domain/pom.xml new file mode 100644 index 0000000..0c9dddd --- /dev/null +++ b/nutch-plugins/urlfilter-domain/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-domain</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-domain</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java new file mode 100644 index 0000000..821d944 --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.domain; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.LinkedHashSet; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLFilter; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.util.domain.DomainSuffix; + +/** + * <p> + * Filters URLs based on a file containing domain suffixes, domain names, and + * hostnames. Only a url that matches one of the suffixes, domains, or hosts + * present in the file is allowed. + * </p> + * + * <p> + * Urls are checked in order of domain suffix, domain name, and hostname against + * entries in the domain file. The domain file would be set up as follows with + * one entry per line: + * + * <pre> + * com + * apache.org + * www.apache.org + * </pre> + * + * <p> + * The first line is an example of a filter that would allow all .com domains. + * The second line allows all urls from apache.org and all of its subdomains + * such as lucene.apache.org and hadoop.apache.org. The third line would allow + * only urls from www.apache.org. There is no specific ordering to entries. The + * entries are from more general to more specific with the more general + * overriding the more specific. + * If no entries were loaded at all, every url is allowed.
+ * </p> + * + * The domain file defaults to domain-urlfilter.txt in the classpath but can be + * overridden using the: + * + * <ul> + * <ol> + * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and + * </ol> + * <ol> + * attribute "file" in plugin.xml of this plugin + * </ol> + * </ul> + * + * the attribute "file" has higher precedence if defined. + */ +public class DomainURLFilter implements URLFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(DomainURLFilter.class); + + // read in attribute "file" of this plugin. + // NOTE(review): this field is static but is assigned in setConf, so the + // value is shared by all instances of this filter - confirm this is intended + private static String attributeFile = null; + private Configuration conf; + private String domainFile = null; + private Set<String> domainSet = new LinkedHashSet<String>(); + + /** + * Reads the given reader line by line and adds every line that is not blank + * and does not start with "#" to the domain set, trimmed and lower-cased. + * The "#" check is made on the untrimmed line. The reader is not closed + * here. + * + * @param configReader + * source of the entries, one entry per line + * @throws IOException + * if reading a line fails + */ + private void readConfiguration(Reader configReader) throws IOException { + + // read the configuration file, line by line + BufferedReader reader = new BufferedReader(configReader); + String line = null; + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + // add non-blank lines and non-commented lines + domainSet.add(StringUtils.lowerCase(line.trim())); + } + } + } + + /** + * Default constructor. + */ + public DomainURLFilter() { + + } + + /** + * Constructor that specifies the domain file to use. The file is not read + * here; the name is only stored for use by setConf. + * + * @param domainFile + * The domain file, overrides the file named by the plugin attribute + * "file" and by the property "urlfilter.domain.file" (the + * domain-urlfilter.txt default). + */ + public DomainURLFilter(String domainFile) { + this.domainFile = domainFile; + } + + /** + * Sets the configuration and loads the domain entries. + * + * The attribute "file" of the urlfilter-domain plugin extension is looked up + * first. The entries are then read from the value of the property + * "urlfilter.domain.rules" if it is set; otherwise from the file given to the + * constructor, the plugin attribute "file", or the property + * "urlfilter.domain.file", in that order of precedence. The file is requested + * through conf.getConfResourceAsReader and, if that yields no reader, opened + * with a FileReader. An IOException while reading is logged and not rethrown. + * + * @param conf + * the configuration to read the properties from
+ */ + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "urlfilter-domain"; + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // treat a blank (empty or whitespace-only) attribute value as not defined + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // the constructor's domain file, then the attribute "file", take precedence + // over the property urlfilter.domain.file if defined + String file = conf.get("urlfilter.domain.file"); + String stringRules = conf.get("urlfilter.domain.rules"); + if (domainFile != null) { + file = domainFile; + } else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + // NOTE(review): the reader is not closed in this method or in + // readConfiguration - confirm whether it should be closed after reading + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + /** + * Returns the configuration last passed to setConf. + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Decides whether a url is accepted by this filter. + * + * @param url + * the url to check + * @return the url itself if no domain entries were loaded, or if its domain + * suffix, domain name or host is one of the loaded entries; null if + * none of them matches or if an exception occurs while checking + */ + public String filter(String url) { + // https://issues.apache.org/jira/browse/NUTCH-2189 + // with no entries loaded, every url passes + if (domainSet.size() == 0) return url; + + try { + // match for suffix, domain, and host in that order.
more general will + // override more specific + String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String host = URLUtil.getHost(url); + String suffix = null; + DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url); + if (domainSuffix != null) { + suffix = domainSuffix.getDomain(); + } + + if (domainSet.contains(suffix) || domainSet.contains(domain) + || domainSet.contains(host)) { + return url; + } + + // doesn't match, don't allow + return null; + } catch (Exception e) { + + // if an error happens, log it and reject the url (null is returned). + // NOTE(review): an earlier comment here said the url should be allowed to + // pass - confirm which behavior is intended + LOG.error("Could not apply filter on url: " + url + "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + return null; + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java new file mode 100644 index 0000000..d2eba1f --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to include only URLs which match an element in a given list of + * domain suffixes, domain names, and/or host names. + * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart + * (exclude URLs by host or domain). + */ +package org.apache.nutch.urlfilter.domain; +
