Author: snagel
Date: Mon Oct 20 20:44:00 2014
New Revision: 1633222
URL: http://svn.apache.org/r1633222
Log:
NUTCH-1827 Port issues 1467 and 1561 to 2.x
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Oct 20 20:44:00 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel)
+
* NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche)
* NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Mon Oct 20 20:44:00 2014
@@ -1125,7 +1125,7 @@
<description>
Comma-separated list of keys to be taken from the metadata to generate
fields.
Can be used e.g. for 'description' or 'keywords' provided that these values
are generated
- by a parser (see parse-metatags plugin)
+ by a parser (see the parse-metatags plugin and the property 'metatags.names').
</description>
</property>
@@ -1133,11 +1133,12 @@
<property>
<name>metatags.names</name>
<value>*</value>
- <description> Names of the metatags to extract, separated by ';'.
- Use '*' to extract all metatags. Prefixes the names with 'meta_'
- in the parse-metadata. For instance to index description and keywords,
- you need to activate the plugin index-metadata and set the value of the
- parameter 'index.metadata' to 'meta_description;meta_keywords'.
+ <description>Names of the metatags to extract, separated by ','.
+ Use '*' to extract all metatags. Prefixes the names with 'meta_' in
+ the parse-metadata. For instance, to index description and keywords,
+ you need to activate the plugins parse-metatags and index-metadata
+ and set the value of the properties 'metatags.names' and
+ 'index.metadata' to 'description,keywords'.
</description>
</property>
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java Mon
Oct 20 20:44:00 2014
@@ -21,6 +21,8 @@ import java.net.URL;
import java.util.Iterator;
import java.util.Properties;
+import org.apache.nutch.metadata.Metadata;
+
/**
* This class holds the information about HTML "meta" tags extracted from
* a page. Some special tags have convenience methods for easy checking.
@@ -40,7 +42,7 @@ public class HTMLMetaTags {
private URL refreshHref = null;
- private Properties generalTags = new Properties();
+ private Metadata generalTags = new Metadata();
private Properties httpEquivTags = new Properties();
@@ -166,7 +168,7 @@ public class HTMLMetaTags {
* Returns all collected values of the general meta tags. Property names are
* tag names, property values are "content" values.
*/
- public Properties getGeneralTags() {
+ public Metadata getGeneralTags() {
return generalTags;
}
@@ -188,13 +190,13 @@ public class HTMLMetaTags {
+ ", refreshHref=" + refreshHref + "\n"
);
sb.append(" * general tags:\n");
- Iterator<?> it = generalTags.keySet().iterator();
- while (it.hasNext()) {
- String key = (String)it.next();
+ String[] names = generalTags.names();
+ for (String name : names) {
+ String key = name;
sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n");
}
sb.append(" * http-equiv tags:\n");
- it = httpEquivTags.keySet().iterator();
+ Iterator<Object> it = httpEquivTags.keySet().iterator();
while (it.hasNext()) {
String key = (String)it.next();
sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
Modified:
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
(original)
+++
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
Mon Oct 20 20:44:00 2014
@@ -19,6 +19,10 @@ package org.apache.nutch.indexer.metadat
import java.nio.ByteBuffer;
import java.util.Collection;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
@@ -27,17 +31,18 @@ import org.apache.nutch.indexer.Indexing
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
/**
* Indexer which can be configured to extract metadata from the crawldb, parse
* metadata or content metadata. You can specify the properties "index.db",
* "index.parse" or "index.content" whose values are comma-delimited
- * <value>key1, key2, key3</value>.
+ * <value>key1,key2,key3</value>.
*/
public class MetadataIndexer implements IndexingFilter {
private Configuration conf;
- private static String[] parseFieldnames;
+ private static Map<Utf8,String> parseFieldnames;
private static final String PARSE_CONF_PROPERTY = "index.metadata";
private static final String INDEX_PREFIX = "meta_";
private static final String PARSE_META_PREFIX = "meta_";
@@ -51,14 +56,14 @@ public class MetadataIndexer implements
// add the fields from parsemd
if (parseFieldnames != null) {
- for (String metatag : parseFieldnames) {
- ByteBuffer bvalues = page.getMetadata().get(new Utf8(PARSE_META_PREFIX
- + metatag));
+ for (Entry<Utf8,String> metatag : parseFieldnames.entrySet()) {
+ ByteBuffer bvalues = page.getMetadata().get(metatag.getKey());
if (bvalues != null) {
- String value = new String(bvalues.array());
+ String key = metatag.getValue();
+ String value = Bytes.toString(bvalues.array());
String[] values = value.split("\t");
for (String eachvalue : values) {
- doc.add(INDEX_PREFIX + metatag, eachvalue);
+ doc.add(key, eachvalue);
}
}
}
@@ -69,7 +74,13 @@ public class MetadataIndexer implements
public void setConf(Configuration conf) {
this.conf = conf;
- parseFieldnames = conf.getStrings(PARSE_CONF_PROPERTY);
+ String[] metatags = conf.getStrings(PARSE_CONF_PROPERTY);
+ parseFieldnames = new TreeMap<Utf8,String>();
+ for (int i = 0; i < metatags.length; i++) {
+ parseFieldnames.put(
+ new Utf8(PARSE_META_PREFIX + metatags[i].toLowerCase(Locale.ROOT)),
+ INDEX_PREFIX + metatags[i]);
+ }
// TODO check conflict between field names e.g. could have same label
// from different sources
}
Modified:
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
Mon Oct 20 20:44:00 2014
@@ -78,7 +78,7 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name,
contentNode.getNodeValue());
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
Modified:
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
Mon Oct 20 20:44:00 2014
@@ -22,6 +22,7 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@@ -31,6 +32,7 @@ import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseFilter;
@@ -42,7 +44,7 @@ import org.w3c.dom.DocumentFragment;
/**
* Parse HTML meta tags (keywords, description) and store them in the parse
* metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'
+ * prefix 'meta_'. Metatags are matched ignoring case.
*/
public class MetaTagsParser implements ParseFilter {
@@ -59,12 +61,9 @@ public class MetaTagsParser implements P
this.conf = conf;
// specify whether we want a specific subset of metadata
// by default take everything we can find
- String metatags = conf.get("metatags.names", "*");
- String[] values = metatags.split(";");
- for (String val : values)
- metatagset.add(val.toLowerCase());
- if(metatagset.size()==0){
- metatagset.add("*");
+ String[] values = conf.getStrings("metatags.names", "*");
+ for (String val : values) {
+ metatagset.add(val.toLowerCase(Locale.ROOT));
}
}
@@ -72,56 +71,53 @@ public class MetaTagsParser implements P
return this.conf;
}
+ /**
+ * Check whether the metatag is in the list of metatags to be indexed (or if
+ * '*' is specified). If yes, add it to parse metadata.
+ */
+ private void addIndexedMetatags(Map<CharSequence, ByteBuffer> metadata,
+ String metatag, String value) {
+ String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+ if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ }
+ metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag),
+ ByteBuffer.wrap(value.getBytes()));
+ }
+ }
+
public Parse filter(String url, WebPage page, Parse parse,
HTMLMetaTags metaTags, DocumentFragment doc) {
- Map<Utf8, ByteBuffer> metadata = new HashMap<Utf8, ByteBuffer>();
+ // temporary map: cannot concurrently iterate over and modify page metadata
+ Map<CharSequence, ByteBuffer> metadata = new HashMap<CharSequence,
ByteBuffer>();
// check in the metadata first : the tika-parser
- // might have stored the values there already
- Iterator<Entry<CharSequence, ByteBuffer>> iterator =
page.getMetadata().entrySet().iterator();
- while (iterator.hasNext()) {
- Entry<CharSequence, ByteBuffer> entry = iterator.next();
+ // might have stored the values there already.
+ // Values are then additionally stored with the prefixed key.
+ for (Entry<CharSequence, ByteBuffer> entry :
page.getMetadata().entrySet()) {
String mdName = entry.getKey().toString();
String value = Bytes.toStringBinary(entry.getValue());
- if (metatagset.contains("*") ||
metatagset.contains(mdName.toLowerCase())) {
- // now add the metadata
- LOG.debug("Found meta tag: '" + mdName + "', with value: '" + value
- + "'");
- metadata.put(new Utf8(PARSE_META_PREFIX + mdName.toLowerCase()),
- ByteBuffer.wrap(value.getBytes()));
- }
+ addIndexedMetatags(metadata, mdName, value);
}
- Iterator<Entry<Utf8, ByteBuffer>> itm = metadata.entrySet().iterator();
- while (iterator.hasNext()) {
- Entry<Utf8, ByteBuffer> entry = itm.next();
+
+ // add temporary metadata to page metadata
+ for (Entry<CharSequence, ByteBuffer> entry : metadata.entrySet()) {
page.getMetadata().put(entry.getKey(), entry.getValue());
}
- Properties generalMetaTags = metaTags.getGeneralTags();
- Iterator<Object> it = generalMetaTags.keySet().iterator();
- while (it.hasNext()) {
+ Metadata generalMetaTags = metaTags.getGeneralTags();
+ for (String tagName : generalMetaTags.names()) {
+ // multiple values of a metadata field are separated by '\t' in storage.
StringBuilder sb = new StringBuilder();
- String name = (String) it.next();
- String[] values = new String[] { (String) generalMetaTags.get(name) };
- // The multivalues of a metadata field are saved with a separator '\t'
- // in the storage
- // unless there is only one entry, where no \t is appended.
- for (String value : values) {
- if (values.length > 1) {
- sb.append(value).append("\t");
- } else {
- sb.append(value);
+ for (String value : generalMetaTags.getValues(tagName)) {
+ if (sb.length() > 0) {
+ sb.append("\t");
}
+ sb.append(value);
}
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase()))
{
- // Add the recently parsed value of multiValued array to metadata
- LOG.debug("Found meta tag : " + name + "\t" + sb.toString());
- page.getMetadata().put(new Utf8(PARSE_META_PREFIX +
name.toLowerCase()),
- ByteBuffer.wrap(Bytes.toBytes(sb.toString())));
- }
+ addIndexedMetatags(page.getMetadata(), tagName, sb.toString());
}
Properties httpequiv = metaTags.getHttpEquivTags();
@@ -129,13 +125,7 @@ public class MetaTagsParser implements P
while (tagNames.hasMoreElements()) {
String name = (String) tagNames.nextElement();
String value = httpequiv.getProperty(name);
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase()))
{
- LOG.debug("Found meta tag : " + name + "\t" + value);
- page.getMetadata().put(new Utf8(PARSE_META_PREFIX +
name.toLowerCase()),
- ByteBuffer.wrap(value.getBytes()));
- }
+ addIndexedMetatags(page.getMetadata(), name, value);
}
return parse;
Modified:
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
Mon Oct 20 20:44:00 2014
@@ -140,8 +140,7 @@ public class TestMetaTagsParser {
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name,
- contentNode.getNodeValue());
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
}
}
Modified:
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1633222&r1=1633221&r2=1633222&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
Mon Oct 20 20:44:00 2014
@@ -78,7 +78,7 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name,
contentNode.getNodeValue());
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {