Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Thu Jan 29 05:38:59 2015 @@ -18,29 +18,29 @@ package org.apache.nutch.util.domain; /** - * This class represents the last part of the host name, - * which is operated by authoritives, not individuals. This information - * is needed to find the domain name of a host. The domain name of a host - * is defined to be the last part before the domain suffix, w/o subdomain - * names. As an example the domain name of <br><code> http://lucene.apache.org/ - * </code><br> is <code> apache.org</code> - * <br> - * This class holds three fields, - * <strong>domain</strong> field represents the suffix (such as "co.uk") - * <strong>boost</strong> is a float for boosting score of url's with this suffix - * <strong>status</strong> field represents domain's status + * This class represents the last part of the host name, which is operated by + * authoritives, not individuals. This information is needed to find the domain + * name of a host. The domain name of a host is defined to be the last part + * before the domain suffix, w/o subdomain names. As an example the domain name + * of <br> + * <code> http://lucene.apache.org/ + * </code><br> + * is <code> apache.org</code> <br> + * This class holds three fields, <strong>domain</strong> field represents the + * suffix (such as "co.uk") <strong>boost</strong> is a float for boosting score + * of url's with this suffix <strong>status</strong> field represents domain's + * status * * @author Enis Soztutar <[email protected]> - * @see TopLevelDomain - * for info please see conf/domain-suffixes.xml + * @see TopLevelDomain for info please see conf/domain-suffixes.xml */ public class DomainSuffix { /** - * Enumeration of the status of the tld. Please see domain-suffixes.xml. + * Enumeration of the status of the tld. Please see domain-suffixes.xml. */ - public enum Status { INFRASTRUCTURE, SPONSORED, UNSPONSORED - , STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED + public enum Status { + INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED }; private String domain; @@ -49,7 +49,7 @@ public class DomainSuffix { public static final float DEFAULT_BOOST = 1.0f; public static final Status DEFAULT_STATUS = Status.IN_USE; - + public DomainSuffix(String domain, Status status, float boost) { this.domain = domain; this.status = status; @@ -59,7 +59,7 @@ public class DomainSuffix { public DomainSuffix(String domain) { this(domain, DEFAULT_STATUS, DEFAULT_BOOST); } - + public String getDomain() { return domain; } @@ -71,7 +71,7 @@ public class DomainSuffix { public float getBoost() { return boost; } - + @Override public String toString() { return domain;
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Thu Jan 29 05:38:59 2015 @@ -25,57 +25,62 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.util.StringUtils; /** - * Storage class for <code>DomainSuffix</code> objects - * Note: this class is singleton + * Storage class for <code>DomainSuffix</code> objects Note: this class is + * singleton + * * @author Enis Soztutar <[email protected]> */ public class DomainSuffixes { - private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixes.class); - - private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>(); - + private static final Logger LOG = LoggerFactory + .getLogger(DomainSuffixes.class); + + private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>(); + private static DomainSuffixes instance; - + /** private ctor */ private DomainSuffixes() { String file = "domain-suffixes.xml"; - InputStream input = this.getClass().getClassLoader().getResourceAsStream(file); + InputStream input = this.getClass().getClassLoader() + .getResourceAsStream(file); try { new DomainSuffixesReader().read(this, input); - } - catch (Exception ex) { + } catch (Exception ex) { LOG.warn(StringUtils.stringifyException(ex)); } } - + /** * Singleton instance, lazy instantination - * @return returns the domain suffix instance + * + * @return returns the domain suffix instance */ public static DomainSuffixes getInstance() { - if(instance == null) { + if (instance == null) { instance = new DomainSuffixes(); } return instance; } - + void addDomainSuffix(DomainSuffix tld) { domains.put(tld.getDomain(), tld); } /** return whether the extension is a registered domain entry */ public boolean isDomainSuffix(String extension) { - return domains.containsKey(extension); + return domains.containsKey(extension); } - + /** - * Return the {@link DomainSuffix} object for the extension, if - * extension is a top level domain returned object will be an - * instance of {@link TopLevelDomain} - * @param extension of the domain + * Return the {@link DomainSuffix} object for the extension, if extension is a + * top level domain returned object will be an instance of + * {@link TopLevelDomain} + * + * @param extension + * of the domain */ public DomainSuffix get(String extension) { return domains.get(extension); } - + } Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java Thu Jan 29 05:38:59 2015 @@ -36,16 +36,17 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** - * For parsing xml files containing domain suffix definitions. - * Parsed xml files should validate against - * <code>domain-suffixes.xsd</code> + * For parsing xml files containing domain suffix definitions. Parsed xml files + * should validate against <code>domain-suffixes.xsd</code> + * * @author Enis Soztutar <[email protected]> */ class DomainSuffixesReader { - private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixesReader.class); + private static final Logger LOG = LoggerFactory + .getLogger(DomainSuffixesReader.class); - void read(DomainSuffixes tldEntries, InputStream input) throws IOException{ + void read(DomainSuffixes tldEntries, InputStream input) throws IOException { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -54,28 +55,29 @@ class DomainSuffixesReader { Document document = builder.parse(new InputSource(input)); Element root = document.getDocumentElement(); - - if(root != null && root.getTagName().equals("domains")) { - - Element tlds = (Element)root.getElementsByTagName("tlds").item(0); - Element suffixes = (Element)root.getElementsByTagName("suffixes").item(0); - - //read tlds - readITLDs(tldEntries, (Element)tlds.getElementsByTagName("itlds").item(0)); - readGTLDs(tldEntries, (Element)tlds.getElementsByTagName("gtlds").item(0)); - readCCTLDs(tldEntries, (Element)tlds.getElementsByTagName("cctlds").item(0)); - + + if (root != null && root.getTagName().equals("domains")) { + + Element tlds = (Element) root.getElementsByTagName("tlds").item(0); + Element suffixes = (Element) root.getElementsByTagName("suffixes") + .item(0); + + // read tlds + readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds") + .item(0)); + readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds") + .item(0)); + readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds") + .item(0)); + readSuffixes(tldEntries, suffixes); - } - else { + } else { throw new IOException("xml file is not valid"); } - } - catch (ParserConfigurationException ex) { + } catch (ParserConfigurationException ex) { LOG.warn(StringUtils.stringifyException(ex)); throw new IOException(ex.getMessage()); - } - catch (SAXException ex) { + } catch (SAXException ex) { LOG.warn(StringUtils.stringifyException(ex)); throw new IOException(ex.getMessage()); } @@ -83,22 +85,24 @@ class DomainSuffixesReader { void readITLDs(DomainSuffixes tldEntries, Element el) { NodeList children = el.getElementsByTagName("tld"); - for(int i=0;i<children.getLength();i++) { - tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.INFRASTRUCTURE)); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), + Type.INFRASTRUCTURE)); } } - + void readGTLDs(DomainSuffixes tldEntries, Element el) { NodeList children = el.getElementsByTagName("tld"); - for(int i=0;i<children.getLength();i++) { - tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.GENERIC)); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), + Type.GENERIC)); } } void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException { NodeList children = el.getElementsByTagName("tld"); - for(int i=0;i<children.getLength();i++) { - tldEntries.addDomainSuffix(readCCTLD((Element)children.item(i))); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i))); } } @@ -113,39 +117,40 @@ class DomainSuffixesReader { String domain = el.getAttribute("domain"); Status status = readStatus(el); float boost = readBoost(el); - String countryName = readCountryName(el); - return new TopLevelDomain(domain, status, boost, countryName); + String countryName = readCountryName(el); + return new TopLevelDomain(domain, status, boost, countryName); } - + /** read optional field status */ Status readStatus(Element el) { NodeList list = el.getElementsByTagName("status"); - if(list == null || list.getLength() == 0) + if (list == null || list.getLength() == 0) return DomainSuffix.DEFAULT_STATUS; return Status.valueOf(list.item(0).getFirstChild().getNodeValue()); } - + /** read optional field boost */ float readBoost(Element el) { NodeList list = el.getElementsByTagName("boost"); - if(list == null || list.getLength() == 0) + if (list == null || list.getLength() == 0) return DomainSuffix.DEFAULT_BOOST; return Float.parseFloat(list.item(0).getFirstChild().getNodeValue()); } - - /** read field countryname - */ + + /** + * read field countryname + */ String readCountryName(Element el) throws IOException { NodeList list = el.getElementsByTagName("country"); - if(list == null || list.getLength() == 0) + if (list == null || list.getLength() == 0) throw new IOException("Country name should be given"); return list.item(0).getNodeValue(); } - + void readSuffixes(DomainSuffixes tldEntries, Element el) { NodeList children = el.getElementsByTagName("suffix"); - for(int i=0;i<children.getLength();i++) { - tldEntries.addDomainSuffix(readSuffix((Element)children.item(i))); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readSuffix((Element) children.item(i))); } } @@ -155,5 +160,5 @@ class DomainSuffixesReader { float boost = readBoost(el); return new DomainSuffix(domain, status, boost); } - + } Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Thu Jan 29 05:38:59 2015 @@ -18,44 +18,50 @@ package org.apache.nutch.util.domain; /** - * (From wikipedia) A top-level domain (TLD) is the last part of an - * Internet domain name; that is, the letters which follow the final - * dot of any domain name. For example, in the domain name - * <code>www.website.com</code>, the top-level domain is <code>com</code>. - * + * (From wikipedia) A top-level domain (TLD) is the last part of an Internet + * domain name; that is, the letters which follow the final dot of any domain + * name. For example, in the domain name <code>www.website.com</code>, the + * top-level domain is <code>com</code>. + * * @author Enis Soztutar <[email protected]> * * @see <a href="http://www.iana.org/"> iana.org</a> * - * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> Top-level_domain</a> + * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> + * Top-level_domain</a> */ public class TopLevelDomain extends DomainSuffix { - public enum Type { INFRASTRUCTURE, GENERIC, COUNTRY }; - + public enum Type { + INFRASTRUCTURE, GENERIC, COUNTRY + }; + private Type type; private String countryName = null; - - public TopLevelDomain(String domain, Type type, Status status, float boost){ + + public TopLevelDomain(String domain, Type type, Status status, float boost) { super(domain, status, boost); this.type = type; } - public TopLevelDomain(String domain, Status status, float boost, String countryName){ + public TopLevelDomain(String domain, Status status, float boost, + String countryName) { super(domain, status, boost); this.type = Type.COUNTRY; this.countryName = countryName; } - + public Type getType() { return type; } - /** Returns the country name if TLD is Country Code TLD + /** + * Returns the country name if TLD is Country Code TLD + * * @return country name or null - */ - public String getCountryName(){ + */ + public String getCountryName() { return countryName; } - + } Modified: nutch/trunk/src/java/org/apache/nutch/util/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Miscellaneous utility classes. */ package org.apache.nutch.util; + Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original) +++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -41,16 +41,17 @@ import java.net.MalformedURLException; /** Adds basic searchable fields to a document. */ public class CCIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(CCIndexingFilter.class); /** The name of the document field we use. */ public static String FIELD = "cc"; private Configuration conf; - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) - throws IndexingException { - + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); @@ -81,9 +82,11 @@ public class CCIndexingFilter implements return doc; } - /** Add the features represented by a license URL. Urls are of the form + /** + * Add the features represented by a license URL. Urls are of the form * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a - * license feature. */ + * license feature. + */ public void addUrlFeatures(NutchDocument doc, String urlString) { try { URL url = new URL(urlString); @@ -92,7 +95,7 @@ public class CCIndexingFilter implements StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); if (names.hasMoreTokens()) - names.nextToken(); // throw away "licenses" + names.nextToken(); // throw away "licenses" // add a feature per component after "licenses" while (names.hasMoreTokens()) { @@ -105,7 +108,7 @@ public class CCIndexingFilter implements } } } - + private void addFeature(NutchDocument doc, String feature) { doc.add(FIELD, feature); } Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original) +++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Thu Jan 29 05:38:59 2015 @@ -33,27 +33,25 @@ import javax.xml.parsers.*; import org.xml.sax.InputSource; import org.w3c.dom.*; - /** Adds metadata identifying the Creative Commons license used, if any. */ public class CCParseFilter implements HtmlParseFilter { public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class); - - /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/ + /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */ public static class Walker { - private URL base; // base url of page - private String rdfLicense; // subject url found, if any - private URL relLicense; // license url found, if any - private URL anchorLicense; // anchor url found, if any - private String workType; // work type URI + private URL base; // base url of page + private String rdfLicense; // subject url found, if any + private URL relLicense; // license url found, if any + private URL anchorLicense; // anchor url found, if any + private String workType; // work type URI private Walker(URL base) { this.base = base; } - /** Scan the document adding attributes to metadata.*/ - public static void walk(Node doc, URL base, Metadata metadata, Configuration conf) - throws ParseException { + /** Scan the document adding attributes to metadata. */ + public static void walk(Node doc, URL base, Metadata metadata, + Configuration conf) throws ParseException { // walk the DOM tree, scanning for license data Walker walker = new Walker(base); @@ -62,13 +60,13 @@ public class CCParseFilter implements Ht // interpret results of walk String licenseUrl = null; String licenseLocation = null; - if (walker.rdfLicense != null) { // 1st choice: subject in RDF + if (walker.rdfLicense != null) { // 1st choice: subject in RDF licenseLocation = "rdf"; licenseUrl = walker.rdfLicense; - } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license + } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license licenseLocation = "rel"; licenseUrl = walker.relLicense.toString(); - } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license + } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license licenseLocation = "a"; licenseUrl = walker.anchorLicense.toString(); } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) { @@ -78,7 +76,8 @@ public class CCParseFilter implements Ht // add license to metadata if (licenseUrl != null) { if (LOG.isInfoEnabled()) { - LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base); + LOG.info("CC: found " + licenseUrl + " in " + licenseLocation + + " of " + base); } metadata.add(CreativeCommons.LICENSE_URL, licenseUrl); metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation); @@ -86,36 +85,38 @@ public class CCParseFilter implements Ht if (walker.workType != null) { if (LOG.isInfoEnabled()) { - LOG.info("CC: found "+walker.workType+" in "+base); + LOG.info("CC: found " + walker.workType + " in " + base); } metadata.add(CreativeCommons.WORK_TYPE, walker.workType); } } - /** Scan the document looking for RDF in comments and license elements.*/ + /** Scan the document looking for RDF in comments and license elements. */ private void walk(Node node) { - + // check element nodes for license URL if (node instanceof Element) { - findLicenseUrl((Element)node); + findLicenseUrl((Element) node); } // check comment nodes for license RDF if (node instanceof Comment) { - findRdf(((Comment)node).getData()); + findRdf(((Comment) node).getData()); } // recursively walk child nodes NodeList children = node.getChildNodes(); - for (int i = 0; children != null && i < children.getLength(); i++ ) { + for (int i = 0; children != null && i < children.getLength(); i++) { walk(children.item(i)); } } - /** Extract license url from element, if any. Thse are the href attribute - * of anchor elements with rel="license". These must also point to - * http://creativecommons.org/licenses/. */ + /** + * Extract license url from element, if any. Thse are the href attribute of + * anchor elements with rel="license". These must also point to + * http://creativecommons.org/licenses/. + */ private void findLicenseUrl(Element element) { // only look in Anchor elements if (!"a".equalsIgnoreCase(element.getTagName())) @@ -125,54 +126,52 @@ public class CCParseFilter implements Ht String href = element.getAttribute("href"); if (href == null) return; - + try { - URL url = new URL(base, href); // resolve the url + URL url = new URL(base, href); // resolve the url // check that it's a CC license URL - if ("http".equalsIgnoreCase(url.getProtocol()) && - "creativecommons.org".equalsIgnoreCase(url.getHost()) && - url.getPath() != null && - url.getPath().startsWith("/licenses/") && - url.getPath().length() > "/licenses/".length()) { + if ("http".equalsIgnoreCase(url.getProtocol()) + && "creativecommons.org".equalsIgnoreCase(url.getHost()) + && url.getPath() != null && url.getPath().startsWith("/licenses/") + && url.getPath().length() > "/licenses/".length()) { // check rel="license" String rel = element.getAttribute("rel"); if (rel != null && "license".equals(rel) && this.relLicense == null) { - this.relLicense = url; // found rel license + this.relLicense = url; // found rel license } else if (this.anchorLicense == null) { - this.anchorLicense = url; // found anchor license + this.anchorLicense = url; // found anchor license } } - } catch (MalformedURLException e) { // ignore malformed urls + } catch (MalformedURLException e) { // ignore malformed urls } } - /** Configure a namespace aware XML parser. */ - private static final DocumentBuilderFactory FACTORY - = DocumentBuilderFactory.newInstance(); + /** Configure a namespace aware XML parser. */ + private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory + .newInstance(); static { FACTORY.setNamespaceAware(true); } /** Creative Commons' namespace URI. */ private static final String CC_NS = "http://web.resource.org/cc/"; - + /** Dublin Core namespace URI. */ private static final String DC_NS = "http://purl.org/dc/elements/1.1/"; - + /** RDF syntax namespace URI. */ - private static final String RDF_NS - = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; private void findRdf(String comment) { // first check for likely RDF in comment int rdfPosition = comment.indexOf("RDF"); if (rdfPosition < 0) - return; // no RDF, abort + return; // no RDF, abort int nsPosition = comment.indexOf(CC_NS); if (nsPosition < 0) - return; // no RDF, abort + return; // no RDF, abort // try to parse the XML Document doc; @@ -181,28 +180,30 @@ public class CCParseFilter implements Ht doc = parser.parse(new InputSource(new StringReader(comment))); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("CC: Failed to parse RDF in "+base+": "+e); + LOG.warn("CC: Failed to parse RDF in " + base + ": " + e); } - //e.printStackTrace(); + // e.printStackTrace(); return; } // check that root is rdf:RDF NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); if (roots.getLength() != 1) { - if (LOG.isWarnEnabled()) { LOG.warn("CC: No RDF root in "+base); } + if (LOG.isWarnEnabled()) { + LOG.warn("CC: No RDF root in " + base); + } return; } - Element rdf = (Element)roots.item(0); + Element rdf = (Element) roots.item(0); // get cc:License nodes inside rdf:RDF NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); for (int i = 0; i < licenses.getLength(); i++) { - Element l = (Element)licenses.item(i); + Element l = (Element) licenses.item(i); // license is rdf:about= attribute from cc:License - this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue(); + this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue(); // walk predicates of cc:License NodeList predicates = l.getChildNodes(); @@ -210,17 +211,17 @@ public class CCParseFilter implements Ht Node predicateNode = predicates.item(j); if (!(predicateNode instanceof Element)) continue; - Element predicateElement = (Element)predicateNode; + Element predicateElement = (Element) predicateNode; // extract predicates of cc:xxx predicates if (!CC_NS.equals(predicateElement.getNamespaceURI())) { continue; } - + // add object and predicate to metadata // metadata.put(object, predicate); // if (LOG.isInfoEnabled()) { - // LOG.info("CC: found: "+predicate+"="+object); + // LOG.info("CC: found: "+predicate+"="+object); // } } } @@ -230,10 +231,11 @@ public class CCParseFilter implements Ht for (int i = 0; i < works.getLength(); i++) { // get dc:type nodes from cc:Work NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); - + for (int j = 0; j < types.getLength(); j++) { - Element type = (Element)types.item(j); - String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue(); + Element type = (Element) types.item(j); + String workUri = type.getAttributeNodeNS(RDF_NS, "resource") + .getValue(); this.workType = WORK_TYPE_NAMES.get(workUri); } } @@ -246,16 +248,20 @@ public class CCParseFilter implements Ht WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", + "interactive"); WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); } private Configuration conf; - /** Adds metadata or otherwise modifies a parse of an HTML document, given - * the DOM tree of a page. */ - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + /** + * Adds metadata or otherwise modifies a parse of an HTML document, given the + * DOM tree of a page. + */ + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { // get parse obj Parse parse = parseResult.get(content.getUrl()); @@ -266,9 +272,8 @@ public class CCParseFilter implements Ht base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); - parseResult.put(content.getUrl(), - new ParseText(emptyParse.getText()), - emptyParse.getData()); + parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), + emptyParse.getData()); return parseResult; } @@ -277,9 +282,8 @@ public class CCParseFilter implements Ht Walker.walk(doc, base, parse.getData().getParseMeta(), getConf()); } catch (ParseException e) { Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); - parseResult.put(content.getUrl(), - new ParseText(emptyParse.getText()), - emptyParse.getData()); + parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), + emptyParse.getData()); return parseResult; } Modified: nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original) +++ nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Jan 29 05:38:59 2015 @@ -30,30 +30,28 @@ import java.io.*; public class TestCCParseFilter { - private static final File testDir = - new File(System.getProperty("test.input")); + private static final File testDir = new File(System.getProperty("test.input")); @Test public void testPages() throws Exception { pageTest(new File(testDir, "anchor.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); + "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); // Tika returns <a> whereas parse-html returns <rel> // check later pageTest(new File(testDir, "rel.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); + "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); // Tika returns <a> whereas parse-html returns <rdf> // check later pageTest(new File(testDir, "rdf.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); + "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); } - public void pageTest(File file, String url, - String license, String location, String type) - throws Exception { + public void pageTest(File file, String url, String license, String location, + String type) throws Exception { String contentType = "text/html"; InputStream in = new FileInputStream(file); - ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length()); + ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length()); byte[] buffer = new byte[1024]; int i; while ((i = in.read(buffer)) != -1) { @@ -63,14 +61,13 @@ public class TestCCParseFilter { byte[] bytes = out.toByteArray(); Configuration conf = NutchConfiguration.create(); - Content content = - new Content(url, url, bytes, contentType, new Metadata(), conf); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - + Content content = new Content(url, url, bytes, contentType, new Metadata(), + conf); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + Metadata metadata = parse.getData().getParseMeta(); Assert.assertEquals(license, metadata.get("License-Url")); Assert.assertEquals(location, metadata.get("License-Location")); Assert.assertEquals(type, metadata.get("Work-Type")); } } - Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (original) +++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -38,78 +38,77 @@ import org.apache.nutch.parse.ParseData; * @author mattmann * @since NUTCH-444 * - * An {@link IndexingFilter} implementation to pull out the - * relevant extracted {@link Metadata} fields from the RSS feeds - * and into the index. - * + * An {@link IndexingFilter} implementation to pull out the relevant + * extracted {@link Metadata} fields from the RSS feeds and into the + * index. + * */ public class FeedIndexingFilter implements IndexingFilter { - + public static final String dateFormatStr = "yyyyMMddHHmm"; - + private Configuration conf; - + private final static String PUBLISHED_DATE = "publishedDate"; - + private final static String UPDATED_DATE = "updatedDate"; - + /** * Extracts out the relevant fields: * * <ul> - * <li>FEED_AUTHOR</li> - * <li>FEED_TAGS</li> - * <li>FEED_PUBLISHED</li> - * <li>FEED_UPDATED</li> - * <li>FEED</li> + * <li>FEED_AUTHOR</li> + * <li>FEED_TAGS</li> + * <li>FEED_PUBLISHED</li> + * <li>FEED_UPDATED</li> + * <li>FEED</li> * </ul> * - * And sends them to the {@link Indexer} for indexing within the Nutch - * index. - * + * And sends them to the {@link Indexer} for indexing within the Nutch index. + * */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, - Inlinks inlinks) throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { ParseData parseData = parse.getData(); Metadata parseMeta = parseData.getParseMeta(); - + String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR); String[] tags = parseMeta.getValues(Feed.FEED_TAGS); String published = parseMeta.get(Feed.FEED_PUBLISHED); String updated = parseMeta.get(Feed.FEED_UPDATED); String feed = parseMeta.get(Feed.FEED); - + if (authors != null) { for (String author : authors) { doc.add(Feed.FEED_AUTHOR, author); } } - + if (tags != null) { for (String tag : tags) { doc.add(Feed.FEED_TAGS, tag); } } - + if (feed != null) doc.add(Feed.FEED, feed); - + if (published != null) { Date date = new Date(Long.parseLong(published)); doc.add(PUBLISHED_DATE, date); } - + if (updated != null) { Date date = new Date(Long.parseLong(updated)); doc.add(UPDATED_DATE, date); } - + return doc; } /** - * @return the {@link Configuration} object used to configure - * this {@link IndexingFilter}. + * @return the {@link Configuration} object used to configure this + * {@link IndexingFilter}. */ public Configuration getConf() { return conf; @@ -119,8 +118,9 @@ public class FeedIndexingFilter implemen * Sets the {@link Configuration} object used to configure this * {@link IndexingFilter}. * - * @param conf The {@link Configuration} object used to configure - * this {@link IndexingFilter}. + * @param conf + * The {@link Configuration} object used to configure this + * {@link IndexingFilter}. */ public void setConf(Configuration conf) { this.conf = conf; Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java (original) +++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Indexing filter to index meta data from RSS feeds. */ package org.apache.nutch.indexer.feed; + Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java (original) +++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java Thu Jan 29 05:38:59 2015 @@ -66,10 +66,10 @@ import com.sun.syndication.io.SyndFeedIn * @author mattmann * @since NUTCH-444 * - * <p> - * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links - * and content present in the feed. - * </p> + * <p> + * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced + * links and content present in the feed. + * </p> * */ public class FeedParser implements Parser { @@ -99,8 +99,8 @@ public class FeedParser implements Parse * A {@link Content} object representing the feed that is being * parsed by this {@link Parser}. * - * @return A {@link ParseResult} containing all {@link Parse}d feeds that - * were present in the feed file that this {@link Parser} dealt with. + * @return A {@link ParseResult} containing all {@link Parse}d feeds that were + * present in the feed file that this {@link Parser} dealt with. * */ public ParseResult getParse(Content content) { @@ -111,8 +111,8 @@ public class FeedParser implements Parse detector.autoDetectClues(content, true); String encoding = detector.guessEncoding(content, defaultEncoding); try { - InputSource input = new InputSource(new ByteArrayInputStream(content - .getContent())); + InputSource input = new InputSource(new ByteArrayInputStream( + content.getContent())); input.setEncoding(encoding); SyndFeedInput feedInput = new SyndFeedInput(); feed = feedInput.build(input); @@ -134,8 +134,8 @@ public class FeedParser implements Parse } List<?> entries = feed.getEntries(); - for(Object entry: entries) { - addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content); + for (Object entry : entries) { + addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content); } String feedDesc = stripTags(feed.getDescriptionEx()); @@ -170,8 +170,8 @@ public class FeedParser implements Parse this.parserFactory = new ParserFactory(conf); this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK); this.filters = new URLFilters(conf); - this.defaultEncoding = - conf.get("parser.character.encoding.default", "windows-1252"); + this.defaultEncoding = conf.get("parser.character.encoding.default", + "windows-1252"); } /** @@ -255,8 +255,8 @@ public class FeedParser implements Parse if (text == null) { List<?> contents = entry.getContents(); StringBuilder buf = new StringBuilder(); - for (Object syndContent: contents) { - buf.append(((SyndContent)syndContent).getValue()); + for (Object syndContent : contents) { + buf.append(((SyndContent) syndContent).getValue()); } text = buf.toString(); } @@ -273,9 +273,9 @@ public class FeedParser implements Parse ParseData data = parse.getData(); data.getContentMeta().remove(Response.CONTENT_TYPE); mergeMetadata(data.getParseMeta(), parseMeta); - parseResult.put(link, new ParseText(parse.getText()), new ParseData( - ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data - .getContentMeta(), data.getParseMeta())); + parseResult.put(link, new ParseText(parse.getText()), + new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), + data.getContentMeta(), data.getParseMeta())); } else { contentMeta.remove(Response.CONTENT_TYPE); parseResult.put(link, new ParseText(text), new ParseData( @@ -323,7 +323,7 @@ public class FeedParser implements Parse } } - for (Object i: categories) { + for (Object i : categories) { parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName()); } Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java (original) +++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Parse RSS feeds. */ package org.apache.nutch.parse.feed; + Modified: nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (original) +++ nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Thu Jan 29 05:38:59 2015 @@ -43,7 +43,7 @@ import org.apache.nutch.util.NutchConfig * * @author mattmann * - * Test Suite for the {@link FeedParser}. + * Test Suite for the {@link FeedParser}. * */ public class TestFeedParser { @@ -96,18 +96,17 @@ public class TestFeedParser { Assert.assertEquals(3, parseResult.size()); - boolean hasLink1 = false, hasLink2 = false, hasLink3=false; + boolean hasLink1 = false, hasLink2 = false, hasLink3 = false; for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j .hasNext();) { Map.Entry<Text, Parse> entry = j.next(); - if (entry.getKey().toString().equals( - "http://www-scf.usc.edu/~mattmann/")) { + if (entry.getKey().toString() + .equals("http://www-scf.usc.edu/~mattmann/")) { hasLink1 = true; } else if (entry.getKey().toString().equals("http://www.nutch.org/")) { hasLink2 = true; - } - else if(entry.getKey().toString().equals(urlString)){ + } else if (entry.getKey().toString().equals(urlString)) { hasLink3 = true; } Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jan 29 05:38:59 2015 @@ -38,15 +38,16 @@ public class HeadingsParseFilter impleme * Pattern used to strip surpluss whitespace */ protected static Pattern whitespacePattern = Pattern.compile("\\s+"); - + private Configuration conf; private String[] headings; private boolean multiValued = false; - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { Parse parse = parseResult.get(content.getUrl()); - for (int i = 0 ; headings != null && i < headings.length ; i++ ) { + for (int i = 0; headings != null && i < headings.length; i++) { List<String> discoveredHeadings = getElement(doc, headings[i]); if (discoveredHeadings.size() > 0) { @@ -89,7 +90,7 @@ public class HeadingsParseFilter impleme if (currentNode.getNodeType() == Node.ELEMENT_NODE) { if (element.equalsIgnoreCase(currentNode.getNodeName())) { headings.add(getNodeValue(currentNode)); - + // Check for multiValued here, if disabled we don't need // to discover more headings. if (!multiValued) { Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree. */ package org.apache.nutch.parse.headings; + Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -30,13 +30,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Indexing filter that offers an option to either index all inbound anchor text for - * a document or deduplicate anchors. Deduplication does have it's con's, + * Indexing filter that offers an option to either index all inbound anchor text + * for a document or deduplicate anchors. Deduplication does have it's con's, + * * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. */ public class AnchorIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(AnchorIndexingFilter.class); private Configuration conf; private boolean deduplicate = false; @@ -49,6 +51,7 @@ public class AnchorIndexingFilter implem deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false); LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off")); } + /** * Get the {@link Configuration} object */ @@ -57,28 +60,33 @@ public class AnchorIndexingFilter implem } /** - * The {@link AnchorIndexingFilter} filter object which supports boolean - * configuration settings for the deduplication of anchors. - * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. - * - * @param doc The {@link NutchDocument} object - * @param parse The relevant {@link Parse} object passing through the filter - * @param url URL to be filtered for anchor text - * @param datum The {@link CrawlDatum} entry - * @param inlinks The {@link Inlinks} containing anchor text + * The {@link AnchorIndexingFilter} filter object which supports boolean + * configuration settings for the deduplication of anchors. See + * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered for anchor text + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, - Inlinks inlinks) throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String[] anchors = (inlinks != null ? inlinks.getAnchors() - : new String[0]); + String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]); HashSet<String> set = null; for (int i = 0; i < anchors.length; i++) { if (deduplicate) { - if (set == null) set = new HashSet<String>(); + if (set == null) + set = new HashSet<String>(); String lcAnchor = anchors[i].toLowerCase(); // Check if already processed the current anchor Modified: nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -29,12 +29,11 @@ import org.junit.Assert; import org.junit.Test; /** - * JUnit test case which tests - * 1. that anchor text is obtained - * 2. that anchor deduplication functionality is working + * JUnit test case which tests 1. that anchor text is obtained 2. that anchor + * deduplication functionality is working * * @author lewismc - * + * */ public class TestAnchorIndexingFilter { @@ -52,14 +51,17 @@ public class TestAnchorIndexingFilter { inlinks.add(new Inlink("http://test2.com/", "text2")); inlinks.add(new Inlink("http://test3.com/", "text2")); try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks); - } catch(Exception e){ + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), inlinks); + } catch (Exception e) { e.printStackTrace(); Assert.fail(e.getMessage()); } Assert.assertNotNull(doc); - Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor")); - Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size()); + Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() + .contains("anchor")); + Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") + .getValues().size()); } } Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -39,42 +39,48 @@ import java.util.Date; import org.apache.hadoop.conf.Configuration; -/** - * Adds basic searchable fields to a document. - * The fields added are : domain, host, url, content, title, cache, tstamp - * domain is included depending on {@code indexer.add.domain} in nutch-default.xml. - * title is truncated as per {@code indexer.max.title.length} in nutch-default.xml. - * (As per NUTCH-1004, a zero-length title is not added) - * content is truncated as per {@code indexer.max.content.length} in nutch-default.xml. +/** + * Adds basic searchable fields to a document. The fields added are : domain, + * host, url, content, title, cache, tstamp domain is included depending on + * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per + * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a + * zero-length title is not added) content is truncated as per + * {@code indexer.max.content.length} in nutch-default.xml. */ public class BasicIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(BasicIndexingFilter.class); private int MAX_TITLE_LENGTH; private int MAX_CONTENT_LENGTH; private boolean addDomain = false; private Configuration conf; - /** - * The {@link BasicIndexingFilter} filter object which supports few - * configuration settings for adding basic searchable fields. - * See {@code indexer.add.domain}, {@code indexer.max.title.length}, - * {@code indexer.max.content.length} in nutch-default.xml. - * - * @param doc The {@link NutchDocument} object - * @param parse The relevant {@link Parse} object passing through the filter - * @param url URL to be filtered for anchor text - * @param datum The {@link CrawlDatum} entry - * @param inlinks The {@link Inlinks} containing anchor text - * @return filtered NutchDocument - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) - throws IndexingException { + /** + * The {@link BasicIndexingFilter} filter object which supports few + * configuration settings for adding basic searchable fields. See + * {@code indexer.add.domain}, {@code indexer.max.title.length}, + * {@code indexer.max.content.length} in nutch-default.xml. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered for anchor text + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); String reprUrlString = reprUrl != null ? reprUrl.toString() : null; String urlString = url.toString(); - + String host = null; try { URL u; @@ -83,11 +89,11 @@ public class BasicIndexingFilter impleme } else { u = new URL(urlString); } - + if (addDomain) { doc.add("domain", URLUtil.getDomainName(u)); } - + host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); @@ -108,7 +114,10 @@ public class BasicIndexingFilter impleme // title String title = parse.getData().getTitle(); - if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate title if needed + if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate + // title + // if + // needed title = title.substring(0, MAX_TITLE_LENGTH); } Modified: nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -34,20 +34,20 @@ import org.junit.Test; import java.util.Date; /** - * JUnit test case which tests - * 1. that basic searchable fields are added to a document - * 2. that domain is added as per {@code indexer.add.domain} in nutch-default.xml. - * 3. that title is truncated as per {@code indexer.max.title.length} in nutch-default.xml. - * 4. that content is truncated as per {@code indexer.max.content.length} in nutch-default.xml. + * JUnit test case which tests 1. that basic searchable fields are added to a + * document 2. that domain is added as per {@code indexer.add.domain} in + * nutch-default.xml. 3. that title is truncated as per + * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is + * truncated as per {@code indexer.max.content.length} in nutch-default.xml. * * @author tejasp - * + * */ public class TestBasicIndexingFilter { @Test - public void testBasicIndexingFilter() throws Exception { + public void testBasicIndexingFilter() throws Exception { Configuration conf = NutchConfiguration.create(); conf.setInt("indexer.max.title.length", 10); conf.setBoolean("indexer.add.domain", true); @@ -63,8 +63,10 @@ public class TestBasicIndexingFilter { Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") }; Metadata metaData = new Metadata(); metaData.add("Language", "en/us"); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); - ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, metaData); + ParseImpl parse = new ParseImpl( + "this is a sample foo bar page. hope you enjoy it.", parseData); CrawlDatum crawlDatum = new CrawlDatum(); crawlDatum.setFetchTime(100L); @@ -72,18 +74,26 @@ public class TestBasicIndexingFilter { Inlinks inlinks = new Inlinks(); try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks); - } catch(Exception e){ + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + crawlDatum, inlinks); + } catch (Exception e) { e.printStackTrace(); Assert.fail(e.getMessage()); } Assert.assertNotNull(doc); - Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0)); - Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0)); - Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0)); - Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", - doc.getField("url").getValues().get(0)); - Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0)); - Assert.assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0)); + Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc + .getField("title").getValues().get(0)); + Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc + .getField("domain").getValues().get(0)); + Assert.assertEquals("test host, expect \"nutch.apache.org\"", + "nutch.apache.org", doc.getField("host").getValues().get(0)); + Assert.assertEquals( + "test url, expect \"http://nutch.apache.org/index.html\"", + "http://nutch.apache.org/index.html", doc.getField("url").getValues() + .get(0)); + Assert.assertEquals("test content", "this is a sample foo", + doc.getField("content").getValues().get(0)); + Assert.assertEquals("test fetch time", new Date(100L), + (Date) doc.getField("tstamp").getValues().get(0)); } } Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java (original) +++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java Thu Jan 29 05:38:59 2015 @@ -41,13 +41,16 @@ import com.maxmind.geoip2.record.Subdivi import com.maxmind.geoip2.record.Traits; /** - * <p>Simple utility class which enables efficient, structured - * {@link org.apache.nutch.indexer.NutchDocument} building based on input - * from {@link GeoIPIndexingFilter}, where configuration is also read.</p> - * <p>Based on the nature of the input, this class wraps factory type - * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}'s - * with the correct {@link org.apache.nutch.indexer.NutchField} information. - * + * <p> + * Simple utility class which enables efficient, structured + * {@link org.apache.nutch.indexer.NutchDocument} building based on input from + * {@link GeoIPIndexingFilter}, where configuration is also read. + * </p> + * <p> + * Based on the nature of the input, this class wraps factory type + * implementations for populating {@link org.apache.nutch.indexer.NutchDocument} + * 's with the correct {@link org.apache.nutch.indexer.NutchField} information. + * */ public class GeoIPDocumentCreator { @@ -58,13 +61,15 @@ public class GeoIPDocumentCreator { } public static NutchDocument createDocFromInsightsService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception { + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { doc.add("ip", serverIp); - InsightsResponse response = client.insights(InetAddress.getByName(serverIp)); - //CityResponse response = client.city(InetAddress.getByName(serverIp)); - + InsightsResponse response = client + .insights(InetAddress.getByName(serverIp)); + // CityResponse response = client.city(InetAddress.getByName(serverIp)); + City city = response.getCity(); - doc.add("cityName", city.getName()); // 'Minneapolis' + doc.add("cityName", city.getName()); // 'Minneapolis' doc.add("cityConfidence", city.getConfidence()); // 50 doc.add("cityGeoNameId", city.getGeoNameId()); @@ -74,31 +79,32 @@ public class GeoIPDocumentCreator { doc.add("continentName", continent.getName()); Country country = response.getCountry(); - doc.add("countryIsoCode", country.getIsoCode()); // 'US' - doc.add("countryName", country.getName()); // 'United States' - doc.add("countryConfidence", country.getConfidence()); // 99 + doc.add("countryIsoCode", country.getIsoCode()); // 'US' + doc.add("countryName", country.getName()); // 'United States' + doc.add("countryConfidence", country.getConfidence()); // 99 doc.add("countryGeoName", country.getGeoNameId()); Location location = response.getLocation(); - doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, -93.2323 - doc.add("accRadius", location.getAccuracyRadius()); // 3 - doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' + doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, + // -93.2323 + doc.add("accRadius", location.getAccuracyRadius()); // 3 + doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' doc.add("metroCode", location.getMetroCode()); Postal postal = response.getPostal(); - doc.add("postalCode", postal.getCode()); // '55455' + doc.add("postalCode", postal.getCode()); // '55455' doc.add("postalConfidence", postal.getConfidence()); // 40 RepresentedCountry rCountry = response.getRepresentedCountry(); doc.add("countryType", rCountry.getType()); Subdivision subdivision = response.getMostSpecificSubdivision(); - doc.add("subDivName", subdivision.getName()); // 'Minnesota' - doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' + doc.add("subDivName", subdivision.getName()); // 'Minnesota' + doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' doc.add("subDivConfidence", subdivision.getConfidence()); // 90 doc.add("subDivGeoNameId", subdivision.getGeoNameId()); - Traits traits = response.getTraits(); + Traits traits = response.getTraits(); doc.add("autonSystemNum", traits.getAutonomousSystemNumber()); doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization()); doc.add("domain", traits.getDomain()); @@ -112,20 +118,23 @@ public class GeoIPDocumentCreator { @SuppressWarnings("unused") public static NutchDocument createDocFromCityService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception { + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { CityResponse response = client.city(InetAddress.getByName(serverIp)); return doc; } @SuppressWarnings("unused") public static NutchDocument createDocFromCountryService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception { - CountryResponse response = client.country(InetAddress.getByName(serverIp)); + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { + CountryResponse response = client.country(InetAddress.getByName(serverIp)); return doc; } - public static NutchDocument createDocFromIspDb(String serverIp, NutchDocument doc, - DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { + public static NutchDocument createDocFromIspDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { IspResponse response = reader.isp(InetAddress.getByName(serverIp)); doc.add("ip", serverIp); doc.add("autonSystemNum", response.getAutonomousSystemNumber()); @@ -135,8 +144,9 @@ public class GeoIPDocumentCreator { return doc; } - public static NutchDocument createDocFromDomainDb(String serverIp, NutchDocument doc, - DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { + public static NutchDocument createDocFromDomainDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { DomainResponse response = reader.domain(InetAddress.getByName(serverIp)); doc.add("ip", serverIp); doc.add("domain", response.getDomain()); @@ -144,20 +154,23 @@ public class GeoIPDocumentCreator { } public static NutchDocument createDocFromConnectionDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { - ConnectionTypeResponse response = reader.connectionType(InetAddress.getByName(serverIp)); + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { + ConnectionTypeResponse response = reader.connectionType(InetAddress + .getByName(serverIp)); doc.add("ip", serverIp); doc.add("connType", response.getConnectionType().toString()); return doc; } - public static NutchDocument createDocFromCityDb(String serverIp, NutchDocument doc, - DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { + public static NutchDocument createDocFromCityDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { doc.add("ip", serverIp); CityResponse response = reader.city(InetAddress.getByName(serverIp)); City city = response.getCity(); - doc.add("cityName", city.getName()); // 'Minneapolis' + doc.add("cityName", city.getName()); // 'Minneapolis' doc.add("cityConfidence", city.getConfidence()); // 50 doc.add("cityGeoNameId", city.getGeoNameId()); @@ -167,27 +180,28 @@ public class GeoIPDocumentCreator { doc.add("continentName", continent.getName()); Country country = response.getCountry(); - doc.add("countryIsoCode", country.getIsoCode()); // 'US' - doc.add("countryName", country.getName()); // 'United States' - doc.add("countryConfidence", country.getConfidence()); // 99 + doc.add("countryIsoCode", country.getIsoCode()); // 'US' + doc.add("countryName", country.getName()); // 'United States' + doc.add("countryConfidence", country.getConfidence()); // 99 doc.add("countryGeoName", country.getGeoNameId()); Location location = response.getLocation(); - doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, -93.2323 - doc.add("accRadius", location.getAccuracyRadius()); // 3 - doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' + doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, + // -93.2323 + doc.add("accRadius", location.getAccuracyRadius()); // 3 + doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' doc.add("metroCode", location.getMetroCode()); Postal postal = response.getPostal(); - doc.add("postalCode", postal.getCode()); // '55455' + doc.add("postalCode", postal.getCode()); // '55455' doc.add("postalConfidence", postal.getConfidence()); // 40 RepresentedCountry rCountry = response.getRepresentedCountry(); doc.add("countryType", rCountry.getType()); Subdivision subdivision = response.getMostSpecificSubdivision(); - doc.add("subDivName", subdivision.getName()); // 'Minnesota' - doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' + doc.add("subDivName", subdivision.getName()); // 'Minnesota' + doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' doc.add("subDivConfidence", subdivision.getConfidence()); // 90 doc.add("subDivGeoNameId", subdivision.getGeoNameId()); return doc; Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -34,16 +34,22 @@ import com.maxmind.geoip2.DatabaseReader import com.maxmind.geoip2.WebServiceClient; /** - * <p>This plugin implements an indexing filter which takes - * advantage of the - * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p> - * <p>The third party library distribution provides an API for the GeoIP2 - * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> - * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. - * The API also works with the free - * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p> - * <p>Depending on the service level agreement, you have with the GeoIP service provider, - * the plugin can add a number of the following fields to the index data model: + * <p> + * This plugin implements an indexing filter which takes advantage of the <a + * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>. + * </p> + * <p> + * The third party library distribution provides an API for the GeoIP2 <a + * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web + * services</a> and <a + * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The + * API also works with the free <a + * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>. + * </p> + * <p> + * Depending on the service level agreement, you have with the GeoIP service + * provider, the plugin can add a number of the following fields to the index + * data model: * <ol> * <li>Continent</li> * <li>Country</li> @@ -56,51 +62,59 @@ import com.maxmind.geoip2.WebServiceClie * <li>Confidence Factors</li> * <li>Radius</li> * <li>User Type</li> - * </ol></p> + * </ol> + * </p> * - * <p>Some of the services are documented at the - * <a href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision Services</a> - * webpage where more information can be obtained.</p> + * <p> + * Some of the services are documented at the <a + * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision + * Services</a> webpage where more information can be obtained. + * </p> + * + * <p> + * You should also consult the following three properties in + * <code>nutch-site.xml</code> + * </p> * - * <p>You should also consult the following three properties in <code>nutch-site.xml</code></p> * <pre> - * {@code - *<!-- index-geoip plugin properties --> -<property> - <name>index.geoip.usage</name> - <value>insightsService</value> - <description> - A string representing the information source to be used for GeoIP information - association. Either enter 'cityDatabase', 'connectionTypeDatabase', - 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the - Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, - GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath - and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf - </description> -</property> - -<property> - <name>index.geoip.userid</name> - <value></value> - <description> - The userId associated with the GeoIP2 Precision Services account. - </description> -</property> - -<property> - <name>index.geoip.licensekey</name> - <value></value> - <description> - The license key associated with the GeoIP2 Precision Services account. - </description> -</property> -} + * {@code + * <!-- index-geoip plugin properties --> + * <property> + * <name>index.geoip.usage</name> + * <value>insightsService</value> + * <description> + * A string representing the information source to be used for GeoIP information + * association. Either enter 'cityDatabase', 'connectionTypeDatabase', + * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the + * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, + * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath + * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf + * </description> + * </property> + * + * <property> + * <name>index.geoip.userid</name> + * <value></value> + * <description> + * The userId associated with the GeoIP2 Precision Services account. + * </description> + * </property> + * + * <property> + * <name>index.geoip.licensekey</name> + * <value></value> + * <description> + * The license key associated with the GeoIP2 Precision Services account. + * </description> + * </property> + * } * </pre> * */ public class GeoIPIndexingFilter implements IndexingFilter { - private static final Logger LOG = LoggerFactory.getLogger(GeoIPIndexingFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(GeoIPIndexingFilter.class); private Configuration conf; @@ -112,7 +126,7 @@ public class GeoIPIndexingFilter impleme DatabaseReader reader = null; - //private AbstractResponse response = null; + // private AbstractResponse response = null; /** * Default constructor for this plugin @@ -145,7 +159,8 @@ public class GeoIPIndexingFilter impleme } } else if (use.equalsIgnoreCase("connectionTypeDatabase")) { try { - geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb").getFile()); + geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb") + .getFile()); buildDb(); } catch (Exception e) { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); @@ -165,8 +180,8 @@ public class GeoIPIndexingFilter impleme LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } else if (use.equalsIgnoreCase("insightsService")) { - client = new WebServiceClient.Builder( - conf.getInt("index.geoip.userid", 12345), conf.get("index.geoip.licensekey")).build(); + client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid", + 12345), conf.get("index.geoip.licensekey")).build(); } usage = use; } @@ -181,7 +196,9 @@ public class GeoIPIndexingFilter impleme /** * - * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks) + * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, + * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, + * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks) */ @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, @@ -189,22 +206,28 @@ public class GeoIPIndexingFilter impleme return addServerGeo(doc, parse.getData(), url.toString()); } - private NutchDocument addServerGeo(NutchDocument doc, ParseData data, String url) { + private NutchDocument addServerGeo(NutchDocument doc, ParseData data, + String url) { if (conf.getBoolean("store.ip.address", false) == true) { try { String serverIp = data.getContentMeta().get("_ip_"); if (serverIp != null) { if (usage.equalsIgnoreCase("cityDatabase")) { - doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader); + doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, + reader); } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { - doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, reader); + doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, + reader); } else if (usage.equalsIgnoreCase("domainDatabase")) { - doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, reader); + doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, + reader); } else if (usage.equalsIgnoreCase("ispDatabase")) { - doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader); + doc = GeoIPDocumentCreator + .createDocFromIspDb(serverIp, doc, reader); } else if (usage.equalsIgnoreCase("insightsService")) { - doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc, client); + doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, + doc, client); } } } catch (Exception e) { Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original) +++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * Metadata may come from CrawlDb, parse or content metadata. */ package org.apache.nutch.indexer.metadata; +
