Author: tejasp
Date: Thu Jan 2 19:30:07 2014
New Revision: 1554881
URL: http://svn.apache.org/r1554881
Log:
NUTCH-1080 Type safe members, arguments for better readability
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jan 2 19:30:07 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1080 Type safe members, arguments for better readability (tejasp)
+
* NUTCH-1360 Suport the storing of IP address connected to when web crawling
(lewismc, ferdy and Yasin Kılınç)
* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly
(İlhami KALKAN, snagel via markus)
Modified:
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java Thu Jan 2 19:30:07 2014
@@ -124,7 +124,6 @@ public class FeedParser implements Parse
.getEmptyParseResult(content.getUrl(), getConf());
}
- List entries = feed.getEntries();
String feedLink = feed.getLink();
try {
feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
@@ -134,9 +133,9 @@ public class FeedParser implements Parse
feedLink = null;
}
- for (Iterator i = entries.iterator(); i.hasNext();) {
- SyndEntry entry = (SyndEntry) i.next();
- addToMap(parseResult, feed, feedLink, entry, content);
+ List<?> entries = feed.getEntries();
+ for(Object entry: entries) {
+ addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
}
String feedDesc = stripTags(feed.getDescriptionEx());
@@ -254,11 +253,10 @@ public class FeedParser implements Parse
text = description.getValue();
if (text == null) {
- List contents = entry.getContents();
+ List<?> contents = entry.getContents();
StringBuilder buf = new StringBuilder();
- for (Iterator i = contents.iterator(); i.hasNext();) {
- SyndContent syndContent = (SyndContent) i.next();
- buf.append(syndContent.getValue());
+ for (Object syndContent: contents) {
+ buf.append(((SyndContent)syndContent).getValue());
}
text = buf.toString();
}
@@ -304,7 +302,7 @@ public class FeedParser implements Parse
private void addFields(Metadata parseMeta, Metadata contentMeta,
SyndFeed feed, SyndEntry entry) {
- List authors = entry.getAuthors(), categories = entry.getCategories();
+ List<?> authors = entry.getAuthors(), categories = entry.getCategories();
Date published = entry.getPublishedDate(), updated =
entry.getUpdatedDate();
String contentType = null;
@@ -325,8 +323,8 @@ public class FeedParser implements Parse
}
}
- for (Iterator i = categories.iterator(); i.hasNext();) {
- parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i.next()).getName());
+ for (Object i: categories) {
+ parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
}
if (published != null) {
@@ -341,7 +339,7 @@ public class FeedParser implements Parse
contentType = description.getType();
} else {
// TODO: What to do if contents.size() > 1?
- List contents = entry.getContents();
+ List<?> contents = entry.getContents();
if (contents.size() > 0) {
contentType = ((SyndContent) contents.get(0)).getType();
}
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java Thu Jan 2 19:30:07 2014
@@ -33,7 +33,7 @@ public interface HttpAuthentication {
*
* @return The credentials value
*/
- public List getCredentials();
+ public List<String> getCredentials();
/**
* Gets the realm used by the HttpAuthentication object during creation.
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Thu Jan 2 19:30:07 2014
@@ -19,9 +19,6 @@ package org.apache.nutch.protocol.httpcl
// JDK imports
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.TreeMap;
// Slf4j Logging imports
import org.slf4j.Logger;
@@ -40,102 +37,59 @@ import org.apache.nutch.metadata.Metadat
* with the ability to authenticate when prompted. The goal is to provide
* multiple authentication types but for now just the {@link HttpBasicAuthentication} authentication
* type is provided.
- *
+ *
* @see HttpBasicAuthentication
* @see Http
* @see HttpResponse
- *
+ *
* @author Matt Tencati
*/
public class HttpAuthenticationFactory implements Configurable {
- /**
- * The HTTP Authentication (WWW-Authenticate) header which is returned
- * by a webserver requiring authentication.
- */
- public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
-
- public static final Logger LOG = LoggerFactory.getLogger(HttpAuthenticationFactory.class);
-
- private static Map auths = new TreeMap();
-
- private Configuration conf = null;
-
-
- public HttpAuthenticationFactory(Configuration conf) {
- setConf(conf);
- }
+ /**
+ * The HTTP Authentication (WWW-Authenticate) header which is returned
+ * by a webserver requiring authentication.
+ */
+ public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- //if (conf.getBoolean("http.auth.verbose", false)) {
- // LOG.setLevel(Level.FINE);
- //} else {
- // LOG.setLevel(Level.WARNING);
- //}
- }
+ public static final Logger LOG = LoggerFactory.getLogger(HttpAuthenticationFactory.class);
- public Configuration getConf() {
- return conf;
- }
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
-
- public HttpAuthentication findAuthentication(Metadata header) {
-
- if (header == null) return null;
-
- try {
- Collection challenge = null;
- if (header instanceof Metadata) {
- Object o = header.get(WWW_AUTHENTICATE);
- if (o instanceof Collection) {
- challenge = (Collection) o;
- } else {
- challenge = new ArrayList();
- challenge.add(o.toString());
- }
- } else {
- String challengeString = header.get(WWW_AUTHENTICATE);
- if (challengeString != null) {
- challenge = new ArrayList();
- challenge.add(challengeString);
- }
- }
- if (challenge == null) {
- if (LOG.isTraceEnabled()) {
- LOG.trace("Authentication challenge is null");
- }
- return null;
- }
-
- Iterator i = challenge.iterator();
- HttpAuthentication auth = null;
- while (i.hasNext() && auth == null) {
- String challengeString = (String)i.next();
- if (challengeString.equals("NTLM")) {
- challengeString="Basic realm=techweb";
- }
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("Checking challengeString=" + challengeString);
- }
- auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
- if (auth != null) return auth;
-
- //TODO Add additional Authentication lookups here
- }
- } catch (Exception e) {
- LOG.error("Error: ", e);
- }
- return null;
+ private Configuration conf = null;
+
+ public HttpAuthenticationFactory(Configuration conf) {
+ setConf(conf);
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public HttpAuthentication findAuthentication(Metadata header) {
+
+ if (header == null) return null;
+
+ try {
+ Collection<String> challenge = new ArrayList<String>();
+ challenge.add(header.get(WWW_AUTHENTICATE));
+
+ for(String challengeString: challenge) {
+ if (challengeString.equals("NTLM"))
+ challengeString="Basic realm=techweb";
+
+ if (LOG.isTraceEnabled())
+ LOG.trace("Checking challengeString=" + challengeString);
+
+ HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
+ if (auth != null) return auth;
+
+ //TODO Add additional Authentication lookups here
+ }
+ } catch (Exception e) {
+ LOG.error("Error: ", e);
}
+ return null;
+ }
}
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Thu Jan 2 19:30:07 2014
@@ -50,11 +50,11 @@ public class HttpBasicAuthentication imp
private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
- private static Map authMap = new TreeMap();
+ private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>();
private Configuration conf = null;
private String challenge = null;
- private ArrayList credentials = null;
+ private ArrayList<String> credentials = null;
private String realm = null;
@@ -70,7 +70,7 @@ public class HttpBasicAuthentication imp
setConf(conf);
this.challenge = challenge;
- credentials = new ArrayList();
+ credentials = new ArrayList<String>();
String username = this.conf.get("http.auth.basic." + challenge +
".user");
String password = this.conf.get("http.auth.basic." + challenge +
".password");
@@ -126,7 +126,7 @@ public class HttpBasicAuthentication imp
* @return Credentials in the form of <code>Authorization: Basic
<Base64 encoded userid:password>
*
*/
- public List getCredentials() {
+ public List<String> getCredentials() {
return credentials;
}
Modified:
nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jan 2 19:30:07 2014
@@ -87,10 +87,10 @@ public class OPICScoringFilter implement
}
/** Increase the score by a sum of inlinked scores. */
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
float adjust = 0.0f;
for (int i = 0; i < inlinked.size(); i++) {
- CrawlDatum linked = (CrawlDatum)inlinked.get(i);
+ CrawlDatum linked = inlinked.get(i);
adjust += linked.getScore();
}
if (old == null) old = datum;
Modified:
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Jan 2 19:30:07 2014
@@ -49,7 +49,7 @@ public class CollectionManager extends C
static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class);
- transient Map collectionMap = new HashMap();
+ transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
transient URL configfile;
Modified:
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Jan 2 19:30:07 2014
@@ -18,6 +18,7 @@ package org.apache.nutch.collection;
import java.util.ArrayList;
import java.util.Iterator;
+import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
@@ -41,9 +42,8 @@ public class Subcollection extends Confi
public static final String TAG_KEY="key";
public static final String TAG_ID="id";
- ArrayList blackList = new ArrayList();
-
- ArrayList whiteList = new ArrayList();
+ List<String> blackList = new ArrayList<String>();
+ List<String> whiteList = new ArrayList<String>();
/**
* SubCollection identifier
@@ -121,7 +121,7 @@ public class Subcollection extends Confi
*
* @return Whitelist entries
*/
- public ArrayList getWhiteList() {
+ public List<String> getWhiteList() {
return whiteList;
}
@@ -147,7 +147,7 @@ public class Subcollection extends Confi
* @param whiteList
* The whiteList to set.
*/
- public void setWhiteList(ArrayList whiteList) {
+ public void setWhiteList(ArrayList<String> whiteList) {
this.whiteList = whiteList;
}
@@ -165,10 +165,10 @@ public class Subcollection extends Confi
*/
public String filter(String urlString) {
// first the blacklist
- Iterator i = blackList.iterator();
+ Iterator<String> i = blackList.iterator();
while (i.hasNext()) {
String row = (String) i.next();
- if (urlString.indexOf(row) != -1)
+ if (urlString.contains(row))
return null;
}
@@ -176,7 +176,7 @@ public class Subcollection extends Confi
i = whiteList.iterator();
while (i.hasNext()) {
String row = (String) i.next();
- if (urlString.indexOf(row) != -1)
+ if (urlString.contains(row))
return urlString;
}
return null;
@@ -218,7 +218,7 @@ public class Subcollection extends Confi
* @param list
* @param text
*/
- protected void parseList(ArrayList list, String text) {
+ protected void parseList(List<String> list, String text) {
list.clear();
StringTokenizer st = new StringTokenizer(text, "\n\r");
Modified:
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java (original)
+++ nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java Thu Jan 2 19:30:07 2014
@@ -150,7 +150,7 @@ public class URLMetaScoringFilter extend
/** Boilerplate */
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List inlinked) throws ScoringFilterException {
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
return;
}
Modified:
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1554881&r1=1554880&r2=1554881&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Thu Jan 2 19:30:07 2014
@@ -24,33 +24,12 @@ import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizers;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;
@@ -61,7 +40,7 @@ public class TestRegexURLNormalizer exte
private RegexURLNormalizer normalizer;
private Configuration conf;
- private HashMap testData = new HashMap();
+ private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>();
// This system property is defined in ./src/plugin/build-plugin.xml
private String sampleDir = System.getProperty("test.data", ".");
@@ -100,9 +79,9 @@ public class TestRegexURLNormalizer exte
}
public void testNormalizerScope() throws Exception {
- Iterator it = testData.keySet().iterator();
+ Iterator<String> it = testData.keySet().iterator();
while (it.hasNext()) {
- String scope = (String)it.next();
+ String scope = it.next();
normalizeTest((NormalizedURL[])testData.get(scope), scope);
}
}
@@ -146,7 +125,7 @@ public class TestRegexURLNormalizer exte
private NormalizedURL[] readTestFile(String scope) throws IOException {
File f = new File(sampleDir, "regex-normalize-" + scope + ".test");
BufferedReader in = new BufferedReader(new InputStreamReader(new
FileInputStream(f), "UTF-8"));
- List list = new ArrayList();
+ List<NormalizedURL> list = new ArrayList<NormalizedURL>();
String line;
while((line = in.readLine()) != null) {
if ( line.trim().length() == 0 ||