http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/ivy.xml b/src/plugin/index-anchor/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/index-anchor/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/plugin.xml b/src/plugin/index-anchor/plugin.xml deleted file mode 100644 index 208594b..0000000 --- a/src/plugin/index-anchor/plugin.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="index-anchor.jar"> - <export name="*" /> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints" /> - </requires> - - <extension id="org.apache.nutch.indexer.anchor" - name="Nutch Anchor Indexing Filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="AnchorIndexingFilter" - class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" /> - </extension> - -</plugin> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java deleted file mode 100644 index 6c9b834..0000000 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.anchor; - -import java.util.HashSet; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Indexing filter that offers an option to either index all inbound anchor text - * for a document or deduplicate anchors. Deduplication does have it's con's, - * - * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. - */ -public class AnchorIndexingFilter implements IndexingFilter { - - public static final Logger LOG = LoggerFactory - .getLogger(AnchorIndexingFilter.class); - private Configuration conf; - private boolean deduplicate = false; - - /** - * Set the {@link Configuration} object - */ - public void setConf(Configuration conf) { - this.conf = conf; - - deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false); - LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off")); - } - - /** - * Get the {@link Configuration} object - */ - public Configuration getConf() { - return this.conf; - } - - /** - * The {@link AnchorIndexingFilter} filter object which supports boolean - * configuration settings for the deduplication of anchors. See - * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. - * - * @param doc - * The {@link NutchDocument} object - * @param parse - * The relevant {@link Parse} object passing through the filter - * @param url - * URL to be filtered for anchor text - * @param datum - * The {@link CrawlDatum} entry - * @param inlinks - * The {@link Inlinks} containing anchor text - * @return filtered NutchDocument - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]); - - HashSet<String> set = null; - - for (int i = 0; i < anchors.length; i++) { - if (deduplicate) { - if (set == null) - set = new HashSet<String>(); - String lcAnchor = anchors[i].toLowerCase(); - - // Check if already processed the current anchor - if (!set.contains(lcAnchor)) { - doc.add("anchor", anchors[i]); - - // Add to map - set.add(lcAnchor); - } - } else { - doc.add("anchor", anchors[i]); - } - } - - return doc; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html deleted file mode 100644 index c255029..0000000 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>An indexing plugin for inbound anchor text.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java deleted file mode 100644 index 08a42f3..0000000 --- a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.anchor; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlink; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit test case which tests 1. that anchor text is obtained 2. that anchor - * deduplication functionality is working - * - * @author lewismc - * - */ -public class TestAnchorIndexingFilter { - - @Test - public void testDeduplicateAnchor() throws Exception { - Configuration conf = NutchConfiguration.create(); - conf.setBoolean("anchorIndexingFilter.deduplicate", true); - AnchorIndexingFilter filter = new AnchorIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseImpl parse = new ParseImpl("foo bar", new ParseData()); - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://test1.com/", "text1")); - inlinks.add(new Inlink("http://test2.com/", "text2")); - inlinks.add(new Inlink("http://test3.com/", "text2")); - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - new CrawlDatum(), inlinks); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() - .contains("anchor")); - Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") - .getValues().size()); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/build.xml b/src/plugin/index-basic/build.xml deleted file mode 100755 index a834290..0000000 --- a/src/plugin/index-basic/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-basic" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/ivy.xml b/src/plugin/index-basic/ivy.xml deleted file mode 100644 index 848216e..0000000 --- a/src/plugin/index-basic/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/plugin.xml b/src/plugin/index-basic/plugin.xml deleted file mode 100755 index c5d784d..0000000 --- a/src/plugin/index-basic/plugin.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="index-basic" - name="Basic Indexing Filter" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="index-basic.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.indexer.basic" - name="Nutch Basic Indexing Filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="BasicIndexingFilter" - class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java deleted file mode 100644 index 8584fa8..0000000 --- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java +++ /dev/null @@ -1,158 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.indexer.basic; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.Parse; - -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.URLUtil; -import org.apache.hadoop.io.Text; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Date; - -import org.apache.hadoop.conf.Configuration; - -/** - * Adds basic searchable fields to a document. The fields added are : domain, - * host, url, content, title, cache, tstamp domain is included depending on - * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per - * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a - * zero-length title is not added) content is truncated as per - * {@code indexer.max.content.length} in nutch-default.xml. - */ -public class BasicIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory - .getLogger(BasicIndexingFilter.class); - - private int MAX_TITLE_LENGTH; - private int MAX_CONTENT_LENGTH; - private boolean addDomain = false; - private Configuration conf; - - /** - * The {@link BasicIndexingFilter} filter object which supports few - * configuration settings for adding basic searchable fields. See - * {@code indexer.add.domain}, {@code indexer.max.title.length}, - * {@code indexer.max.content.length} in nutch-default.xml. - * - * @param doc - * The {@link NutchDocument} object - * @param parse - * The relevant {@link Parse} object passing through the filter - * @param url - * URL to be filtered for anchor text - * @param datum - * The {@link CrawlDatum} entry - * @param inlinks - * The {@link Inlinks} containing anchor text - * @return filtered NutchDocument - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); - String reprUrlString = reprUrl != null ? reprUrl.toString() : null; - String urlString = url.toString(); - - String host = null; - try { - URL u; - if (reprUrlString != null) { - u = new URL(reprUrlString); - } else { - u = new URL(urlString); - } - - if (addDomain) { - doc.add("domain", URLUtil.getDomainName(u)); - } - - host = u.getHost(); - } catch (MalformedURLException e) { - throw new IndexingException(e); - } - - if (host != null) { - doc.add("host", host); - } - - doc.add("url", reprUrlString == null ? urlString : reprUrlString); - - // content - String content = parse.getText(); - if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) { - content = content.substring(0, MAX_CONTENT_LENGTH); - } - doc.add("content", StringUtil.cleanField(content)); - - // title - String title = parse.getData().getTitle(); - if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate - // title - // if - // needed - title = title.substring(0, MAX_TITLE_LENGTH); - } - - if (title.length() > 0) { - // NUTCH-1004 Do not index empty values for title field - doc.add("title", StringUtil.cleanField(title)); - } - - // add cached content/summary display policy, if available - String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY); - if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { - doc.add("cache", caching); - } - - // add timestamp when fetched, for deduplication - doc.add("tstamp", new Date(datum.getFetchTime())); - - return doc; - } - - /** - * Set the {@link Configuration} object - */ - public void setConf(Configuration conf) { - this.conf = conf; - this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); - this.addDomain = conf.getBoolean("indexer.add.domain", false); - this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1); - } - - /** - * Get the {@link Configuration} object - */ - public Configuration getConf() { - return this.conf; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html deleted file mode 100644 index 3fae405..0000000 --- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java deleted file mode 100644 index 4bc317e..0000000 --- a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.basic; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.basic.BasicIndexingFilter; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -import java.util.Date; - -/** - * JUnit test case which tests 1. that basic searchable fields are added to a - * document 2. that domain is added as per {@code indexer.add.domain} in - * nutch-default.xml. 3. that title is truncated as per - * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is - * truncated as per {@code indexer.max.content.length} in nutch-default.xml. - * - * @author tejasp - * - */ - -public class TestBasicIndexingFilter { - - @Test - public void testBasicIndexingFilter() throws Exception { - Configuration conf = NutchConfiguration.create(); - conf.setInt("indexer.max.title.length", 10); - conf.setBoolean("indexer.add.domain", true); - conf.setInt("indexer.max.content.length", 20); - - BasicIndexingFilter filter = new BasicIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - - NutchDocument doc = new NutchDocument(); - - String title = "The Foo Page"; - Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") }; - Metadata metaData = new Metadata(); - metaData.add("Language", "en/us"); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, metaData); - ParseImpl parse = new ParseImpl( - "this is a sample foo bar page. hope you enjoy it.", parseData); - - CrawlDatum crawlDatum = new CrawlDatum(); - crawlDatum.setFetchTime(100L); - - Inlinks inlinks = new Inlinks(); - - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - crawlDatum, inlinks); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc - .getField("title").getValues().get(0)); - Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc - .getField("domain").getValues().get(0)); - Assert.assertEquals("test host, expect \"nutch.apache.org\"", - "nutch.apache.org", doc.getField("host").getValues().get(0)); - Assert.assertEquals( - "test url, expect \"http://nutch.apache.org/index.html\"", - "http://nutch.apache.org/index.html", doc.getField("url").getValues() - .get(0)); - Assert.assertEquals("test content", "this is a sample foo", - doc.getField("content").getValues().get(0)); - Assert.assertEquals("test fetch time", new Date(100L), - (Date) doc.getField("tstamp").getValues().get(0)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/build-ivy.xml b/src/plugin/index-geoip/build-ivy.xml deleted file mode 100644 index 2cda7e9..0000000 --- a/src/plugin/index-geoip/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. --> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/build.xml b/src/plugin/index-geoip/build.xml deleted file mode 100644 index 92fda82..0000000 --- a/src/plugin/index-geoip/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-geoip" default="jar-core"> - - <import file="../build-plugin.xml"/> - <target name="init-plugin"> - <echo>Copying MaxMind GeoIP .mmdb files to build</echo> - <copy todir="${build.classes}"> - <fileset dir="${src.dir}" includes="**/*.mmdb" /> - </copy> - </target> -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml deleted file mode 100644 index 1b626f0..0000000 --- a/src/plugin/index-geoip/ivy.xml +++ /dev/null @@ -1,46 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" > - <!-- Exlude due to classpath issues --> - <exclude org="org.apache.httpcomponents" name="httpclient" /> - <exclude org="org.apache.httpcomponents" name="httpcore" /> - </dependency> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml deleted file mode 100644 index 214fbd0..0000000 --- a/src/plugin/index-geoip/plugin.xml +++ /dev/null @@ -1,51 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="index-geoip" - name="GeoIP2 Indexing Filter" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="index-geoip.jar"> - <export name="*"/> - </library> - <library name="commons-codec-1.6.jar"/> - <library name="commons-logging-1.1.1.jar"/> - <library name="geoip2-2.3.1.jar"/> - <library name="google-http-client-1.20.0.jar"/> - <library name="jackson-annotations-2.5.0.jar"/> - <library name="jackson-core-2.5.3.jar"/> - <library name="jackson-databind-2.5.3.jar"/> - <library name="jsr305-1.3.9.jar"/> - <library name="maxmind-db-1.0.0.jar"/> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.indexer.geoip" - name="Nutch GeoIP2 Indexing Filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="GeoIPIndexingFilter" - class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/> - </extension> - -</plugin> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java deleted file mode 100644 index 88d78ef..0000000 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java +++ /dev/null @@ -1,210 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.geoip; - -import java.io.IOException; -import java.net.InetAddress; -import java.net.UnknownHostException; - -import org.apache.nutch.indexer.NutchDocument; - -import com.maxmind.geoip2.DatabaseReader; -import com.maxmind.geoip2.WebServiceClient; -import com.maxmind.geoip2.exception.GeoIp2Exception; -import com.maxmind.geoip2.model.InsightsResponse; -import com.maxmind.geoip2.model.CityResponse; -import com.maxmind.geoip2.model.ConnectionTypeResponse; -import com.maxmind.geoip2.model.CountryResponse; -import com.maxmind.geoip2.model.DomainResponse; -import com.maxmind.geoip2.model.IspResponse; -import com.maxmind.geoip2.record.City; -import com.maxmind.geoip2.record.Continent; -import com.maxmind.geoip2.record.Country; -import com.maxmind.geoip2.record.Location; -import com.maxmind.geoip2.record.Postal; -import com.maxmind.geoip2.record.RepresentedCountry; -import com.maxmind.geoip2.record.Subdivision; -import com.maxmind.geoip2.record.Traits; - -/** - * <p> - * Simple utility class which enables efficient, structured - * {@link org.apache.nutch.indexer.NutchDocument} building based on input from - * {@link GeoIPIndexingFilter}, where configuration is also read. - * </p> - * <p> - * Based on the nature of the input, this class wraps factory type - * implementations for populating {@link org.apache.nutch.indexer.NutchDocument} - * 's with the correct {@link org.apache.nutch.indexer.NutchField} information. - * - */ -public class GeoIPDocumentCreator { - - /** - * Default constructor. - */ - public GeoIPDocumentCreator() { - } - - public static NutchDocument createDocFromInsightsService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { - doc.add("ip", serverIp); - InsightsResponse response = client - .insights(InetAddress.getByName(serverIp)); - // CityResponse response = client.city(InetAddress.getByName(serverIp)); - - City city = response.getCity(); - doc.add("cityName", city.getName()); // 'Minneapolis' - doc.add("cityConfidence", city.getConfidence()); // 50 - doc.add("cityGeoNameId", city.getGeoNameId()); - - Continent continent = response.getContinent(); - doc.add("continentCode", continent.getCode()); - doc.add("continentGeoNameId", continent.getGeoNameId()); - doc.add("continentName", continent.getName()); - - Country country = response.getCountry(); - doc.add("countryIsoCode", country.getIsoCode()); // 'US' - doc.add("countryName", country.getName()); // 'United States' - doc.add("countryConfidence", country.getConfidence()); // 99 - doc.add("countryGeoName", country.getGeoNameId()); - - Location location = response.getLocation(); - doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, - // -93.2323 - doc.add("accRadius", location.getAccuracyRadius()); // 3 - doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' - doc.add("metroCode", location.getMetroCode()); - - Postal postal = response.getPostal(); - doc.add("postalCode", postal.getCode()); // '55455' - doc.add("postalConfidence", postal.getConfidence()); // 40 - - RepresentedCountry rCountry = response.getRepresentedCountry(); - doc.add("countryType", rCountry.getType()); - - Subdivision subdivision = response.getMostSpecificSubdivision(); - doc.add("subDivName", subdivision.getName()); // 'Minnesota' - doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' - doc.add("subDivConfidence", subdivision.getConfidence()); // 90 - doc.add("subDivGeoNameId", subdivision.getGeoNameId()); - - Traits traits = response.getTraits(); - doc.add("autonSystemNum", traits.getAutonomousSystemNumber()); - doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization()); - doc.add("domain", traits.getDomain()); - doc.add("isp", traits.getIsp()); - doc.add("org", traits.getOrganization()); - doc.add("userType", traits.getUserType()); - doc.add("isAnonProxy", traits.isAnonymousProxy()); - doc.add("isSatelliteProv", traits.isSatelliteProvider()); - return doc; - } - - @SuppressWarnings("unused") - public static NutchDocument createDocFromCityService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { - CityResponse response = client.city(InetAddress.getByName(serverIp)); - return doc; - } - - @SuppressWarnings("unused") - public static NutchDocument createDocFromCountryService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { - CountryResponse response = client.country(InetAddress.getByName(serverIp)); - return doc; - } - - public static NutchDocument createDocFromIspDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - IspResponse response = reader.isp(InetAddress.getByName(serverIp)); - doc.add("ip", serverIp); - doc.add("autonSystemNum", response.getAutonomousSystemNumber()); - doc.add("autonSystemOrg", response.getAutonomousSystemOrganization()); - doc.add("isp", response.getIsp()); - doc.add("org", response.getOrganization()); - return doc; - } - - public static NutchDocument createDocFromDomainDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - DomainResponse response = reader.domain(InetAddress.getByName(serverIp)); - doc.add("ip", serverIp); - doc.add("domain", response.getDomain()); - return doc; - } - - public static NutchDocument createDocFromConnectionDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - ConnectionTypeResponse response = reader.connectionType(InetAddress - .getByName(serverIp)); - doc.add("ip", serverIp); - doc.add("connType", response.getConnectionType().toString()); - return doc; - } - - public static NutchDocument createDocFromCityDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - doc.add("ip", serverIp); - CityResponse response = reader.city(InetAddress.getByName(serverIp)); - - City city = response.getCity(); - doc.add("cityName", city.getName()); // 'Minneapolis' - doc.add("cityConfidence", city.getConfidence()); // 50 - doc.add("cityGeoNameId", city.getGeoNameId()); - - Continent continent = response.getContinent(); - doc.add("continentCode", continent.getCode()); - doc.add("continentGeoNameId", continent.getGeoNameId()); - doc.add("continentName", continent.getName()); - - Country country = response.getCountry(); - doc.add("countryIsoCode", country.getIsoCode()); // 'US' - doc.add("countryName", country.getName()); // 'United States' - doc.add("countryConfidence", country.getConfidence()); // 99 - doc.add("countryGeoName", country.getGeoNameId()); - - Location location = response.getLocation(); - doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, - // -93.2323 - doc.add("accRadius", location.getAccuracyRadius()); // 3 - doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' - doc.add("metroCode", location.getMetroCode()); - - Postal postal = response.getPostal(); - doc.add("postalCode", postal.getCode()); // '55455' - doc.add("postalConfidence", postal.getConfidence()); // 40 - - RepresentedCountry rCountry = response.getRepresentedCountry(); - doc.add("countryType", rCountry.getType()); - - Subdivision subdivision = response.getMostSpecificSubdivision(); - doc.add("subDivName", subdivision.getName()); // 'Minnesota' - doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' - doc.add("subDivConfidence", subdivision.getConfidence()); // 90 - doc.add("subDivGeoNameId", subdivision.getGeoNameId()); - return doc; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java deleted file mode 100644 index f515f1f..0000000 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java +++ /dev/null @@ -1,241 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.geoip; - -import java.io.File; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.maxmind.geoip2.DatabaseReader; -import com.maxmind.geoip2.WebServiceClient; - -/** - * <p> - * This plugin implements an indexing filter which takes advantage of the <a - * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>. - * </p> - * <p> - * The third party library distribution provides an API for the GeoIP2 <a - * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web - * services</a> and <a - * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The - * API also works with the free <a - * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>. - * </p> - * <p> - * Depending on the service level agreement, you have with the GeoIP service - * provider, the plugin can add a number of the following fields to the index - * data model: - * <ol> - * <li>Continent</li> - * <li>Country</li> - * <li>Regional Subdivision</li> - * <li>City</li> - * <li>Postal Code</li> - * <li>Latitude/Longitude</li> - * <li>ISP/Organization</li> - * <li>AS Number</li> - * <li>Confidence Factors</li> - * <li>Radius</li> - * <li>User Type</li> - * </ol> - * </p> - * - * <p> - * Some of the services are documented at the <a - * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision - * Services</a> webpage where more information can be obtained. - * </p> - * - * <p> - * You should also consult the following three properties in - * <code>nutch-site.xml</code> - * </p> - * - * <pre> - * {@code - * <!-- index-geoip plugin properties --> - * <property> - * <name>index.geoip.usage</name> - * <value>insightsService</value> - * <description> - * A string representing the information source to be used for GeoIP information - * association. Either enter 'cityDatabase', 'connectionTypeDatabase', - * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the - * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, - * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath - * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf - * </description> - * </property> - * - * <property> - * <name>index.geoip.userid</name> - * <value></value> - * <description> - * The userId associated with the GeoIP2 Precision Services account. - * </description> - * </property> - * - * <property> - * <name>index.geoip.licensekey</name> - * <value></value> - * <description> - * The license key associated with the GeoIP2 Precision Services account. - * </description> - * </property> - * } - * </pre> - * - */ -public class GeoIPIndexingFilter implements IndexingFilter { - - private static final Logger LOG = LoggerFactory - .getLogger(GeoIPIndexingFilter.class); - - private Configuration conf; - - private String usage = null; - - private File geoDb = null; - - WebServiceClient client = null; - - DatabaseReader reader = null; - - // private AbstractResponse response = null; - - /** - * Default constructor for this plugin - */ - public GeoIPIndexingFilter() { - } - - /** - * @see org.apache.hadoop.conf.Configurable#getConf() - */ - @Override - public Configuration getConf() { - return this.conf; - } - - /** - * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) - */ - @Override - public void setConf(Configuration conf) { - this.conf = conf; - String use = conf.get("index.geoip.usage", "insightsService"); - LOG.debug("GeoIP usage medium set to: {}", use); - if (use.equalsIgnoreCase("cityDatabase")) { - try { - geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile()); - buildDb(); - } catch (Exception e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } else if (use.equalsIgnoreCase("connectionTypeDatabase")) { - try { - geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb") - .getFile()); - buildDb(); - } catch (Exception e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } else if (use.equalsIgnoreCase("domainDatabase")) { - try { - geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile()); - buildDb(); - } catch (Exception e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } else if (use.equalsIgnoreCase("ispDatabase")) { - try { - geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile()); - buildDb(); - } catch (Exception e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } else if (use.equalsIgnoreCase("insightsService")) { - client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid", - 12345), conf.get("index.geoip.licensekey")).build(); - } - usage = use; - } - - private void buildDb() { - try { - reader = new DatabaseReader.Builder(geoDb).build(); - } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } - - /** - * - * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, - * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, - * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks) - */ - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - return addServerGeo(doc, parse.getData(), url.toString()); - } - - private NutchDocument addServerGeo(NutchDocument doc, ParseData data, - String url) { - - if (conf.getBoolean("store.ip.address", false) == true) { - try { - String serverIp = data.getContentMeta().get("_ip_"); - if (serverIp != null) { - if (usage.equalsIgnoreCase("cityDatabase")) { - doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, - reader); - } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { - doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, - reader); - } else if (usage.equalsIgnoreCase("domainDatabase")) { - doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, - reader); - } else if (usage.equalsIgnoreCase("ispDatabase")) { - doc = GeoIPDocumentCreator - .createDocFromIspDb(serverIp, doc, reader); - } else if (usage.equalsIgnoreCase("insightsService")) { - doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, - doc, client); - } - } - } catch (Exception e) { - LOG.error(e.getMessage()); - e.printStackTrace(); - } - } - return doc; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java deleted file mode 100644 index ba62519..0000000 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * <p>This plugin implements an indexing filter which takes - * advantage of the - * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p> - * <p>The third party library distribution provides an API for the GeoIP2 - * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> - * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. - * The API also works with the free - * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>. - * - */ -package org.apache.nutch.indexer.geoip; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/build.xml b/src/plugin/index-links/build.xml deleted file mode 100644 index b853ccf..0000000 --- a/src/plugin/index-links/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-links" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml deleted file mode 100644 index 0a363f7..0000000 --- a/src/plugin/index-links/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="${nutch.root}/ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/plugin.xml b/src/plugin/index-links/plugin.xml deleted file mode 100644 index dfdc5d2..0000000 --- a/src/plugin/index-links/plugin.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="index-links" - name="Index inlinks and outlinks" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="index-links.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter" - name="Links indexing filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter" - class="org.apache.nutch.indexer.links.LinksIndexingFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java deleted file mode 100644 index 975df66..0000000 --- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java +++ /dev/null @@ -1,167 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.links; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlink; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.slf4j.LoggerFactory; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -/** - * An {@link org.apache.nutch.indexer.IndexingFilter} that adds - * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. - * - * In case that you want to ignore the outlinks that point to the same host - * as the URL being indexed use the following settings in your configuration - * file: - * - * <property> - * <name>index.links.outlinks.host.ignore</name> - * <value>true</value> - * </property> - * - * The same configuration is available for inlinks: - * - * <property> - * <name>index.links.inlinks.host.ignore</name> - * <value>true</value> - * </property> - * - * To store only the host portion of each inlink URL or outlink URL add the - * following to your configuration file. - * - * <property> - * <name>index.links.hosts.only</name> - * <value>false</value> - * </property> - * - */ -public class LinksIndexingFilter implements IndexingFilter { - - public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore"; - public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore"; - public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only"; - - public final static org.slf4j.Logger LOG = LoggerFactory - .getLogger(LinksIndexingFilter.class); - - private Configuration conf; - private boolean filterOutlinks; - private boolean filterInlinks; - private boolean indexHost; - - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - // Add the outlinks - Outlink[] outlinks = parse.getData().getOutlinks(); - - if (outlinks != null) { - Set<String> hosts = new HashSet<String>(); - - for (Outlink outlink : outlinks) { - try { - String linkUrl = outlink.getToUrl(); - String outHost = new URL(linkUrl).getHost().toLowerCase(); - - if (indexHost) { - linkUrl = outHost; - - if (hosts.contains(linkUrl)) - continue; - - hosts.add(linkUrl); - } - - addFilteredLink("outlinks", url.toString(), linkUrl, outHost, - filterOutlinks, doc); - } catch (MalformedURLException e) { - LOG.error("Malformed URL in {}: {}", url, e.getMessage()); - } - } - } - - // Add the inlinks - if (null != inlinks) { - Iterator<Inlink> iterator = inlinks.iterator(); - Set<String> inlinkHosts = new HashSet<String>(); - - while (iterator.hasNext()) { - try { - Inlink link = iterator.next(); - String linkUrl = link.getFromUrl(); - String inHost = new URL(linkUrl).getHost().toLowerCase(); - - if (indexHost) { - linkUrl = inHost; - - if (inlinkHosts.contains(linkUrl)) - continue; - - inlinkHosts.add(linkUrl); - } - - addFilteredLink("inlinks", url.toString(), linkUrl, inHost, - filterInlinks, doc); - } catch (MalformedURLException e) { - LOG.error("Malformed URL in {}: {}", url, e.getMessage()); - } - } - } - - return doc; - } - - private void addFilteredLink(String fieldName, String url, String linkUrl, - String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException { - if (filter) { - String host = new URL(url.toString()).getHost().toLowerCase(); - - if (!host.equalsIgnoreCase(urlHost)) { - doc.add(fieldName, linkUrl); - } - } else { - doc.add(fieldName, linkUrl); - } - } - - public void setConf(Configuration conf) { - this.conf = conf; - filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false); - filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false); - - indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false); - } - - public Configuration getConf() { - return this.conf; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java deleted file mode 100644 index c490d1f..0000000 --- a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.links; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlink; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.NutchField; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.net.URL; -import java.util.Iterator; - -public class TestLinksIndexingFilter { - - Configuration conf = NutchConfiguration.create(); - LinksIndexingFilter filter = new LinksIndexingFilter(); - Metadata metadata = new Metadata(); - - @Before - public void setUp() throws Exception { - metadata.add(Response.CONTENT_TYPE, "text/html"); - } - - private Outlink[] generateOutlinks() throws Exception { - return generateOutlinks(false); - } - - private Outlink[] generateOutlinks(boolean parts) throws Exception { - Outlink[] outlinks = new Outlink[2]; - - outlinks[0] = new Outlink("http://www.test.com", "test"); - outlinks[1] = new Outlink("http://www.example.com", "example"); - - if (parts) { - outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1", - "test"); - outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2", - "test"); - } - - return outlinks; - } - - @Test - public void testFilterOutlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); - - Assert.assertEquals("Filter outlinks, allow only those from a different host", - outlinks[0].getToUrl(), doc.getFieldValue("outlinks")); - } - - @Test - public void testFilterInlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); - - Assert.assertEquals("Filter inlinks, allow only those from a different host", - "http://www.test.com", doc.getFieldValue("inlinks")); - } - - @Test - public void testNoFilterOutlinks() throws Exception { - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals("All outlinks must be indexed even those from the same host", - outlinks.length, doc.getField("outlinks").getValues().size()); - } - - @Test - public void testNoFilterInlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false"); - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals("All inlinks must be indexed even those from the same host", - inlinks.size(), doc.getField("inlinks").getValues().size()); - } - - @Test - public void testIndexOnlyHostPart() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(true); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test")); - inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test")); - inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", - "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - NutchField docOutlinks = doc.getField("outlinks"); - - Assert.assertEquals("Only the host portion of the outlink URL must be indexed", - new URL("http://www.test.com").getHost(), - docOutlinks.getValues().get(0)); - - Assert.assertEquals( - "The inlinks coming from the same host must count only once", 1, - doc.getField("inlinks").getValues().size()); - - Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", - new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks")); - } - - @Test - public void testIndexHostsOnlyAndFilterOutlinks() throws Exception { - conf = NutchConfiguration.create(); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - - Outlink[] outlinks = generateOutlinks(true); - - filter.setConf(conf); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); - - Assert.assertEquals( - "Index only the host portion of the outlinks after filtering", - new URL("http://www.test.com").getHost(), - doc.getFieldValue("outlinks")); - } - - @Test - public void testIndexHostsOnlyAndFilterInlinks() throws Exception { - conf = NutchConfiguration.create(); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); - - Assert.assertEquals( - "Index only the host portion of the inlinks after filtering", - new URL("http://www.test.com").getHost(), - doc.getFieldValue("inlinks")); - - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java deleted file mode 100644 index aaaedbf..0000000 --- a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse; - -import org.junit.Test; - -import java.util.HashSet; -import java.util.Set; - -import static org.junit.Assert.*; - -public class TestOutlinks { - - @Test - public void testAddSameObject() throws Exception { - Set<Outlink> set = new HashSet<>(); - - Outlink o = new Outlink("http://www.example.com", "Example"); - set.add(o); - set.add(o); - - assertEquals("Adding the same Outlink twice", 1, set.size()); - } - - @Test - public void testAddOtherObjectWithSameData() throws Exception { - Set<Outlink> set = new HashSet<>(); - - Outlink o = new Outlink("http://www.example.com", "Example"); - Outlink o1 = new Outlink("http://www.example.com", "Example"); - - assertTrue("The two Outlink objects are the same", o.equals(o1)); - - set.add(o); - set.add(o1); - - assertEquals("The set should contain only 1 Outlink", 1, set.size()); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/build.xml b/src/plugin/index-metadata/build.xml deleted file mode 100644 index ad96d11..0000000 --- a/src/plugin/index-metadata/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-metadata" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/ivy.xml b/src/plugin/index-metadata/ivy.xml deleted file mode 100644 index 24d7606..0000000 --- a/src/plugin/index-metadata/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/plugin.xml b/src/plugin/index-metadata/plugin.xml deleted file mode 100644 index 4d4c9a7..0000000 --- a/src/plugin/index-metadata/plugin.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="index-metadata" - name="Index Metadata" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="index-metadata.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - - <extension id="org.apache.nutch.indexer.metadata" - name="Nutch metadata indexer" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="MetadataIndexer" - class="org.apache.nutch.indexer.metadata.MetadataIndexer"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java deleted file mode 100644 index 78718aa..0000000 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.indexer.metadata; - -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; - -/** - * Indexer which can be configured to extract metadata from the crawldb, parse - * metadata or content metadata. You can specify the properties "index.db.md", - * "index.parse.md" or "index.content.md" who's values are comma-delimited - * <value>key1,key2,key3</value>. - */ -public class MetadataIndexer implements IndexingFilter { - private Configuration conf; - private String[] dbFieldnames; - private Map<String, String> parseFieldnames; - private String[] contentFieldnames; - private static final String db_CONF_PROPERTY = "index.db.md"; - private static final String parse_CONF_PROPERTY = "index.parse.md"; - private static final String content_CONF_PROPERTY = "index.content.md"; - - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - // just in case - if (doc == null) - return doc; - - // add the fields from crawldb - if (dbFieldnames != null) { - for (String metatag : dbFieldnames) { - Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); - if (metadata != null) - doc.add(metatag, metadata.toString()); - } - } - - // add the fields from parsemd - if (parseFieldnames != null) { - for (String metatag : parseFieldnames.keySet()) { - for (String value : parse.getData().getParseMeta().getValues(metatag)) { - if (value != null) - doc.add(parseFieldnames.get(metatag), value); - } - } - } - - // add the fields from contentmd - if (contentFieldnames != null) { - for (String metatag : contentFieldnames) { - for (String value : parse.getData().getContentMeta().getValues(metatag)) { - if (value != null) - doc.add(metatag, value); - } - } - } - - return doc; - } - - public void setConf(Configuration conf) { - this.conf = conf; - dbFieldnames = conf.getStrings(db_CONF_PROPERTY); - parseFieldnames = new HashMap<String, String>(); - for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) { - parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag); - } - contentFieldnames = conf.getStrings(content_CONF_PROPERTY); - - // TODO check conflict between field names e.g. could have same label - // from different sources - - } - - public Configuration getConf() { - return this.conf; - } -}
