[nutch] 02/14: NUTCH-2630 Fetcher to log skipped records by robots.txt - change required log level to INFO (default) for messages reporting skipped URLs because of robots.txt rules (disallow or crawl delay larger than fetcher.max.crawl.delay)

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 524a59480a3e258a0363faf343fa57875f8f9ea8
Author: Sebastian Nagel 
AuthorDate: Mon Oct 8 14:50:51 2018 +0200

NUTCH-2630 Fetcher to log skipped records by robots.txt
- change required log level to INFO (default) for messages
  reporting skipped URLs because of robots.txt rules
  (disallow or crawl delay larger than fetcher.max.crawl.delay)
---
 src/java/org/apache/nutch/fetcher/FetcherThread.java | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java 
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index bfcc374..6ba920e 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -302,9 +302,7 @@ public class FetcherThread extends Thread {
 if (!rules.isAllowed(fit.url.toString())) {
   // unblock
   ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-  if (LOG.isDebugEnabled()) {
-LOG.debug("Denied by robots.txt: {}", fit.url);
-  }
+  LOG.info("Denied by robots.txt: {}", fit.url);
   output(fit.url, fit.datum, null,
   ProtocolStatus.STATUS_ROBOTS_DENIED,
   CrawlDatum.STATUS_FETCH_GONE);
@@ -315,7 +313,7 @@ public class FetcherThread extends Thread {
   if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) 
{
 // unblock
 ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-LOG.debug("Crawl-Delay for {} too long ({}), skipping", 
fit.url,
+LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
 rules.getCrawlDelay());
 output(fit.url, fit.datum, null,
 ProtocolStatus.STATUS_ROBOTS_DENIED,



[nutch] 14/14: NUTCH-1842: crawl.gen.delay value is read incorrectly from config Merge pull request #393 from YossiTamari/patch-2

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit f861c8203c8544b91e061964441485bd2f6de145
Merge: 8151237 e6a961c
Author: Sebastian Nagel 
AuthorDate: Thu Nov 15 11:17:37 2018 +0100

NUTCH-1842: crawl.gen.delay value is read incorrectly from config
Merge pull request #393 from YossiTamari/patch-2

 src/java/org/apache/nutch/crawl/Generator.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)



[nutch] 09/14: NUTCH-2651 Upgrade to Tika 1.19.1 (from 1.18) - modified work-around to fix downloading of dependency javax.ws.rs-api-*.jar: define property packaging.type in ivysettings.xml

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 31a1ec4bab4a702fa8876926d54b212cc40acbce
Author: Sebastian Nagel 
AuthorDate: Sun Oct 21 20:49:51 2018 +0200

NUTCH-2651 Upgrade to Tika 1.19.1 (from 1.18)
- modified work-around to fix downloading of dependency javax.ws.rs-api-*.jar:
  define property packaging.type in ivysettings.xml
---
 default.properties  | 8 --------
 ivy/ivysettings.xml | 8 ++++++++
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/default.properties b/default.properties
index e6b3f4e..bb987d9 100644
--- a/default.properties
+++ b/default.properties
@@ -77,14 +77,6 @@ ivy.shared.default.root=${ivy.default.ivy.user.dir}/shared
 
ivy.shared.default.ivy.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext]
 
ivy.shared.default.artifact.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext]
 
-# work-around to fix failing dependency download of
-#  javax.ws.rs-api.jar
-# required by Tika (1.19 and higher)
-# cf. (also affects ant/ivy)
-#  https://github.com/eclipse-ee4j/jaxrs-api/issues/572
-#  https://github.com/gradle/gradle/issues/3065
-packaging.type=jar
-
 #
 # Plugins API
 #
diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index d9b5044..a2dc700 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -38,6 +38,14 @@
 
value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
   
+  
+  
   
   
   



[nutch] 01/14: NUTCH-2625 ProtocolFactory.getProtocol(url) may create multiple plugin instances - lock critical block (conditional creation of plugin instance) on object cache object

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit a6f533dfecd688a6c43212b0e826be9a2da5b4ce
Author: Sebastian Nagel 
AuthorDate: Tue Jul 24 16:19:04 2018 +0200

NUTCH-2625 ProtocolFactory.getProtocol(url) may create multiple plugin instances
- lock critical block (conditional creation of plugin instance)
  on object cache object
---
 .../org/apache/nutch/protocol/ProtocolFactory.java | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java 
b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
index 87944a8..2d20ecd 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -81,7 +81,7 @@ public class ProtocolFactory {
* @throws ProtocolNotFound
*   when Protocol can not be found for url
*/
-  public synchronized Protocol getProtocol(URL url)
+  public Protocol getProtocol(URL url)
   throws ProtocolNotFound {
 ObjectCache objectCache = ObjectCache.get(conf);
 try {
@@ -91,19 +91,21 @@ public class ProtocolFactory {
   }
 
   String cacheId = Protocol.X_POINT_ID + protocolName;
-  Protocol protocol = (Protocol) objectCache.getObject(cacheId);
-  if (protocol != null) {
+  synchronized (objectCache) {
+Protocol protocol = (Protocol) objectCache.getObject(cacheId);
+if (protocol != null) {
+  return protocol;
+}
+
+Extension extension = findExtension(protocolName);
+if (extension == null) {
+  throw new ProtocolNotFound(protocolName);
+}
+
+protocol = (Protocol) extension.getExtensionInstance();
+objectCache.setObject(cacheId, protocol);
 return protocol;
   }
-
-  Extension extension = findExtension(protocolName);
-  if (extension == null) {
-throw new ProtocolNotFound(protocolName);
-  }
-
-  protocol = (Protocol) extension.getExtensionInstance();
-  objectCache.setObject(cacheId, protocol);
-  return protocol;
 } catch (PluginRuntimeException e) {
   throw new ProtocolNotFound(url.toString(), e.toString());
 }



[nutch] 12/14: NUTCH-2671 Upgrade to ant ivy library - fix order of ant target dependencies: "compile-core" must come before "resolve-test"

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 393d3e5f96c0f381b904e17e5abcad695f911e5e
Author: Sebastian Nagel 
AuthorDate: Tue Oct 30 16:45:22 2018 +0100

NUTCH-2671 Upgrade to ant ivy library
- fix order of ant target dependencies:
  "compile-core" must come before "resolve-test"
---
 build.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.xml b/build.xml
index e19179e..37c44b8 100644
--- a/build.xml
+++ b/build.xml
@@ -415,7 +415,7 @@
   
   
   
-  
+  
 

[nutch] 03/14: NUTCH-2651 Upgrade core and parse-tika to use Tika 1.19.1 - add work-around to fix downloading of dependency javax.ws.rs-api-*.jar (need to set property packaging.type=jar)

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 2a3b1d15fdebe7ada325b9b955c164270a21e127
Author: Sebastian Nagel 
AuthorDate: Fri Oct 12 13:47:43 2018 +0200

NUTCH-2651 Upgrade core and parse-tika to use Tika 1.19.1
- add work-around to fix downloading of dependency javax.ws.rs-api-*.jar
  (need to set property packaging.type=jar)
---
 default.properties   |   8 +++
 ivy/ivy.xml  |   2 +-
 src/plugin/parse-tika/ivy.xml|   2 +-
 src/plugin/parse-tika/plugin.xml | 112 +++
 4 files changed, 66 insertions(+), 58 deletions(-)

diff --git a/default.properties b/default.properties
index d6f606b..00af414 100644
--- a/default.properties
+++ b/default.properties
@@ -77,6 +77,14 @@ ivy.shared.default.root=${ivy.default.ivy.user.dir}/shared
 
ivy.shared.default.ivy.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext]
 
ivy.shared.default.artifact.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext]
 
+# work-around to fix failing dependency download of
+#  javax.ws.rs-api.jar
+# required by Tika (1.19 and higher)
+# cf. (also affects ant/ivy)
+#  https://github.com/eclipse-ee4j/jaxrs-api/issues/572
+#  https://github.com/gradle/gradle/issues/3065
+packaging.type=jar
+
 #
 # Plugins API
 #
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 5272de6..f1e4a80 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -65,7 +65,7 @@


 
-   
+   

 

diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 81e7a80..53c7775 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,7 +36,7 @@
   
 
   
-
+
   
   
   
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 398c0e4..7dbe180 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,76 +26,79 @@
  
   
   
-  
-  
-  
-  
-  
-  
-  
+  
+  
+  
+  
+  
+  
+  
   
   
   
   
-  
-  
-  
-  
+  
+  
+  
+  
   
   
-  
   
-  
   
-  
-  
-  
-  
+  
+  
+  
+  
   
   
-  
+  
+  
   
   
-  
+  
   
-  
+  
   
-  
-  
+  
+  
+  
   
-  
-  
-  
-  
+  
+  
+  
+  
   
-  
-  
-  
+  
+  
+  
+  
+  
+  
   
-  
+  
   
   
-  
-  
-  
-  
+  
+  
+  
+  
   
   
-  
-  
+  
+  
   
-  
-  
+  
+  
   
-  
   
-  
-  
-  
-  
-  
-  
-  
+  
+  
+  
+  
+  
+  
+  
+  
   
   
   
@@ -106,23 +109,20 @@
   
   
   
-  
-  
-  
-  
-  
-  
+  
+  
   
-  
+  
+  
   
   
   
   
   
   
-  
-  
-  
+  
+  
+  
   
   
   



[nutch] 13/14: NUTCH-2671 Upgrade to ant ivy library - roll back to 2.4.0 to bring Jenkins build back to normal

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit e6a961ce967e94dc7128154b68cfa24fcd4370e9
Author: Sebastian Nagel 
AuthorDate: Tue Oct 30 17:47:22 2018 +0100

NUTCH-2671 Upgrade to ant ivy library
- roll back to 2.4.0 to bring Jenkins build back to normal
---
 default.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/default.properties b/default.properties
index 1423025..bb987d9 100644
--- a/default.properties
+++ b/default.properties
@@ -63,7 +63,7 @@ runtime.dir=./runtime
 runtime.deploy=${runtime.dir}/deploy
 runtime.local=${runtime.dir}/local
 
-ivy.version=2.5.0-rc1
+ivy.version=2.4.0
 ivy.dir=${basedir}/ivy
 ivy.file=${ivy.dir}/ivy.xml
 ivy.jar=${ivy.dir}/ivy-${ivy.version}.jar



[nutch] 05/14: NUTCH-2655 Update Solr schema.xml for Solr 7.x - add required field types to schema.xml

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit a9ea1f1012f6d1b4296d4728b00cf7498aa05dba
Author: Sebastian Nagel 
AuthorDate: Mon Oct 15 15:04:01 2018 +0200

NUTCH-2655 Update Solr schema.xml for Solr 7.x
- add required field types to schema.xml
---
 conf/schema.xml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/conf/schema.xml b/conf/schema.xml
index 6e7d5bf..2b095e5 100644
--- a/conf/schema.xml
+++ b/conf/schema.xml
@@ -300,6 +300,19 @@
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
 

[nutch] 06/14: NUTCH-2659 Add missing Apache license headers

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 48e1aef83b94468c9f839cf28b24560bef233780
Author: Sebastian Nagel 
AuthorDate: Wed Oct 17 14:23:44 2018 +0200

NUTCH-2659 Add missing Apache license headers
---
 .../org/apache/nutch/indexer/IndexWriterParams.java  | 17 +
 .../apache/nutch/scoring/AbstractScoringFilter.java  | 17 +
 .../apache/nutch/tools/CommonCrawlFormatWARC.java| 17 +
 src/java/org/apache/nutch/tools/WARCUtils.java   | 17 +
 .../nutch/webui/pages/instances/InstancePanel.java   | 17 +
 .../nutch/webui/pages/settings/SettingsPage.java | 17 +
 .../parse/headings/TestHeadingsParseFilter.java  | 17 +
 src/plugin/index-replace/plugin.xml  | 16 
 .../nutch/indexwriter/dummy/DummyConstants.java  | 17 +
 src/plugin/parse-metatags/plugin.xml | 16 
 src/plugin/scoring-depth/build.xml   | 16 
 src/plugin/scoring-depth/plugin.xml  | 16 
 .../nutch/scoring/depth/DepthScoringFilter.java  | 17 +
 .../scoring/similarity/cosine/package-info.java  | 20 +---
 .../apache/nutch/crawl/TODOTestCrawlDbStates.java| 17 +
 15 files changed, 251 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexWriterParams.java 
b/src/java/org/apache/nutch/indexer/IndexWriterParams.java
index cc91ec0..952dc9e 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriterParams.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriterParams.java
@@ -1,3 +1,20 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.nutch.indexer;
 
 import org.apache.hadoop.util.StringUtils;
diff --git a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java 
b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
index d74c7fb..cd59274 100644
--- a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
@@ -1,3 +1,20 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.nutch.scoring;
 
 import java.util.Collection;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java 
b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
index 6f89b16..27f1198 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -1,3 +1,20 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License 

[nutch] branch master updated (8151237 -> f861c82)

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git.


from 8151237  Merge pull request #387 from 
sebastian-nagel/NUTCH-2630-fetcher-log-robotstxt-denied
 add 8b7298d  NUTCH-1842: crawl.gen.delay value is read incorrectly from 
configuration.
 new a6f533d  NUTCH-2625 ProtocolFactory.getProtocol(url) may create 
multiple plugin instances - lock critical block (conditional creation of plugin 
instance)   on object cache object
 new 524a594  NUTCH-2630 Fetcher to log skipped records by robots.txt - 
change required log level to INFO (default) for messages   reporting skipped 
URLs because of robots.txt rules   (disallow or crawl delay larger than 
fetcher.max.crawl.delay)
 new 2a3b1d1  NUTCH-2651 Upgrade core and parse-tika to use Tika 1.19.1 - 
add work-around to fix downloading of dependency javax.ws.rs-api-*.jar   (need 
to set property packaging.type=jar)
 new 89b16ce  NUTCH-2652 Fetcher launches more fetch tasks than fetch lists 
- properly override method getSplits(...) of FileInputFormat
 new a9ea1f1  NUTCH-2655 Update Solr schema.xml for Solr 7.x - add required 
field types to schema.xml
 new 48e1aef  NUTCH-2659 Add missing Apache license headers
 new d45fb7a  NUTCH-2660 Plugin tests not executed - add missing unit test 
packages to plugin build.xml - tests of "headings" plugin depend on 
"lib-nekohtml" - add "protocol-okhttp" to Javadoc API overview - add missing 
test packages to ant "eclipse" target
 new 2d48152  NUTCH-2661 Move the TestOutlinks class into the o.a.n.parse 
path
 new 31a1ec4  NUTCH-2651 Upgrade to Tika 1.19.1 (from 1.18) - modified 
work-around to fix downloading of dependency javax.ws.rs-api-*.jar:   define 
property packaging.type in ivysettings.xml
 new a5df63a  NUTCH-2658 Adding the fields required by the index-links 
plugin to the schema
 new 93b1a81  NUTCH-2671 Upgrade to ant ivy library - upgrade to 2.5.0-rc1 
to address NUTCH-2669
 new 393d3e5  NUTCH-2671 Upgrade to ant ivy library - fix order of ant 
target dependencies:   "compile-core" must come before "resolve-test"
 new e6a961c  NUTCH-2671 Upgrade to ant ivy library - roll back to 2.4.0 to 
bring Jenkins build back to normal
 new f861c82  NUTCH-1842: crawl.gen.delay value is read incorrectly from 
config Merge pull request #393 from YossiTamari/patch-2

The 14 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/java/org/apache/nutch/crawl/Generator.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)



[nutch] 10/14: NUTCH-2658 Adding the fields required by the index-links plugin to the schema

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit a5df63a3d644e90fb881a0f16c8f29d9320d1de3
Author: Jorge Luis Betancourt 
AuthorDate: Tue Oct 23 22:57:03 2018 +0200

NUTCH-2658 Adding the fields required by the index-links plugin to the schema
---
 conf/schema.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/conf/schema.xml b/conf/schema.xml
index 2b095e5..57a44ac 100644
--- a/conf/schema.xml
+++ b/conf/schema.xml
@@ -398,6 +398,10 @@
 
 
 
+
+
+
+
 
 
 



[nutch] 07/14: NUTCH-2660 Plugin tests not executed - add missing unit test packages to plugin build.xml - tests of "headings" plugin depend on "lib-nekohtml" - add "protocol-okhttp" to Javadoc API overview - add missing test packages to ant "eclipse" target

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit d45fb7a659ba29371f171817a6a6de72965189c3
Author: Sebastian Nagel 
AuthorDate: Wed Oct 17 14:36:58 2018 +0200

NUTCH-2660 Plugin tests not executed
- add missing unit test packages to plugin build.xml
- tests of "headings" plugin depend on "lib-nekohtml"
- add "protocol-okhttp" to Javadoc API overview
- add missing test packages to ant "eclipse" target
---
 build.xml |  2 ++
 default.properties|  1 +
 src/plugin/build.xml  |  3 +++
 src/plugin/headings/build.xml | 18 ++++++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/build.xml b/build.xml
index 785442a..e19179e 100644
--- a/build.xml
+++ b/build.xml
@@ -1061,6 +1061,7 @@
 
 
 
+
 
 
 
@@ -1104,6 +1105,7 @@
 
 
 
+
 
 
 
diff --git a/default.properties b/default.properties
index 00af414..e6b3f4e 100644
--- a/default.properties
+++ b/default.properties
@@ -101,6 +101,7 @@ plugins.protocol=\
org.apache.nutch.protocol.http*:\
org.apache.nutch.protocol.httpclient*:\
org.apache.nutch.protocol.interactiveselenium*:\
+   org.apache.nutch.protocol.okhttp*:\
org.apache.nutch.protocol.selenium*:\
org.apache.nutch.protocol.htmlunit*:\
 
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d8e2ef5..d8826e8 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -113,9 +113,11 @@
  
  
  
+ 
  
  
  
+ 
  
  
  
@@ -128,6 +130,7 @@
  
  
  
+ 
  
  
  
diff --git a/src/plugin/headings/build.xml b/src/plugin/headings/build.xml
index d334ad1..29288e1 100644
--- a/src/plugin/headings/build.xml
+++ b/src/plugin/headings/build.xml
@@ -19,4 +19,22 @@
 
   
 
+  
+  
+
+  
+
+  
+  
+
+  
+
+  
+
+  
+  
+
+
+  
+
 



[nutch] 08/14: NUTCH-2661 Move the TestOutlinks class into the o.a.n.parse path

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 2d48152db0d032a58ea2324e8b40b6c5c48d7cd6
Author: Jorge Luis Betancourt Gonzalez 
AuthorDate: Wed Oct 17 18:07:51 2018 +0200

NUTCH-2661 Move the TestOutlinks class into the o.a.n.parse path
---
 .../index-links/src => }/test/org/apache/nutch/parse/TestOutlinks.java| 0
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git 
a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java 
b/src/test/org/apache/nutch/parse/TestOutlinks.java
similarity index 100%
rename from 
src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
rename to src/test/org/apache/nutch/parse/TestOutlinks.java



[nutch] 11/14: NUTCH-2671 Upgrade to ant ivy library - upgrade to 2.5.0-rc1 to address NUTCH-2669

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 93b1a8174254de83232be12ac18d99ca4fa83518
Author: Sebastian Nagel 
AuthorDate: Mon Oct 29 13:41:42 2018 +0100

NUTCH-2671 Upgrade to ant ivy library
- upgrade to 2.5.0-rc1 to address NUTCH-2669
---
 .gitignore | 1 +
 default.properties | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index f44d4e7..732ca05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ logs/
 .project
 ivy/ivy-2.3.0.jar
 ivy/ivy-2.4.0.jar
+ivy/ivy-2.5.0-rc1.jar
 naivebayes-model
diff --git a/default.properties b/default.properties
index bb987d9..1423025 100644
--- a/default.properties
+++ b/default.properties
@@ -63,7 +63,7 @@ runtime.dir=./runtime
 runtime.deploy=${runtime.dir}/deploy
 runtime.local=${runtime.dir}/local
 
-ivy.version=2.4.0
+ivy.version=2.5.0-rc1
 ivy.dir=${basedir}/ivy
 ivy.file=${ivy.dir}/ivy.xml
 ivy.jar=${ivy.dir}/ivy-${ivy.version}.jar



[nutch] 04/14: NUTCH-2652 Fetcher launches more fetch tasks than fetch lists - properly override method getSplits(...) of FileInputFormat

2018-11-15 Thread snagel
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 89b16ce29f3bf6618ec2bf9df0807b24c1e40339
Author: Sebastian Nagel 
AuthorDate: Mon Oct 15 13:44:20 2018 +0200

NUTCH-2652 Fetcher launches more fetch tasks than fetch lists
- properly override method getSplits(...) of FileInputFormat
---
 src/java/org/apache/nutch/fetcher/Fetcher.java | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java 
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index f6584c5..fe9e71e 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -23,28 +23,24 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -55,6 +51,8 @@ import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * A queue-based fetcher.
@@ -105,19 +103,20 @@ public class Fetcher extends NutchTool implements Tool {
   private static final Logger LOG = LoggerFactory
   .getLogger(MethodHandles.lookup().lookupClass());
 
-  public static class InputFormat extends
-  SequenceFileInputFormat {
-/** Don't split inputs, to keep things polite. */
-public InputSplit[] getSplits(JobContext job, int nSplits) throws 
IOException {
+  public static class InputFormat
+  extends SequenceFileInputFormat {
+/**
+ * Don't split inputs to keep things polite - a single fetch list must be
+ * processed in one fetcher task. Do not split a fetch lists and assigning
+ * the splits to multiple parallel tasks.
+ */
+@Override
+public List getSplits(JobContext job) throws IOException {
   List files = listStatus(job);
-  FileSplit[] splits = new FileSplit[files.size()];
-  Iterator iterator= files.listIterator();
-  int index = 0;
-  while(iterator.hasNext()) {
-index++;
-FileStatus cur = iterator.next();
-splits[index] = new FileSplit(cur.getPath(), 0, cur.getLen(),
-(String[]) null);
+  List splits = new ArrayList<>();
+  for (FileStatus cur : files) {
+splits.add(
+new FileSplit(cur.getPath(), 0, cur.getLen(), (String[]) null));
   }
   return splits;
 }