Author: jerome
Date: Sat Dec 17 02:06:31 2005
New Revision: 357334
URL: http://svn.apache.org/viewcvs?rev=357334&view=rev
Log: NUTCH-3, ContentProperties can handle multivalued properties (S. Groschupf)
Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=357334&r1=357333&r2=357334&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sat Dec 17 02:06:31 2005 @@ -105,7 +105,7 @@ <property> <name>http.redirect.max</name> - <value>3</value> + <value>10</value> <description>The maximum number of redirects the fetcher will follow when trying to fetch a page.</description> </property> @@ -727,7 +727,8 @@ <property> <name>parser.character.encoding.default</name> - <value>windows-1252</value> + <!--value>windows-1252</value--> + <value>utf-8</value> <description>The character encoding to fall back to when no other information is available</description> </property> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=357334&r1=357333&r2=357334&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 17 02:06:31 2005 @@ -77,11 +77,8 @@ contentType = UTF8.readString(in); // read contentType - int propertyCount = in.readInt(); // read metadata metadata = new ContentProperties(); - for (int i = 0; i < propertyCount; i++) { - metadata.put(UTF8.readString(in), UTF8.readString(in)); - } + metadata.readFields(in); // read meta data } protected final void writeCompressed(DataOutput out) throws IOException { @@ -95,13 +92,7 @@ UTF8.writeString(out, 
contentType); // write contentType - out.writeInt(metadata.size()); // write metadata - Iterator i = metadata.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - UTF8.writeString(out, (String)e.getKey()); - UTF8.writeString(out, (String)e.getValue()); - } + metadata.write(out); // write metadata } public static Content read(DataInput in) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=357334&r1=357333&r2=357334&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Sat Dec 17 02:06:31 2005 @@ -16,15 +16,22 @@ package org.apache.nutch.protocol; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; import java.util.Properties; import java.util.TreeMap; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.io.Writable; + /** - * case insensitive properties + * writable case insensitive properties */ -public class ContentProperties extends TreeMap { +public class ContentProperties extends TreeMap implements Writable { /** * construct the TreeMap with a case insensitive comparator @@ -51,6 +58,36 @@ return (String) get(key); } + /* + * (non-Javadoc) + * + * @see java.util.Map#get(java.lang.Object) + */ + public Object get(Object arg0) { + Object object = super.get(arg0); + if (object != null && object instanceof ArrayList) { + ArrayList list = (ArrayList) object; + return list.get(list.size() - 1); + } + return object; + } + + /** + * @param key + * @return the properties as a string array if there is no such property we + * retunr a array 
with 0 entries + */ + public String[] getProperties(String key) { + Object object = super.get(key); + if (object != null && !(object instanceof ArrayList)) { + return new String[] { (String) object }; + } else if (object != null && object instanceof ArrayList) { + ArrayList list = (ArrayList) object; + return (String[]) list.toArray(new String[list.size()]); + } + return new String[0]; + } + /** * sets the key value tuple * @@ -58,7 +95,17 @@ * @param value */ public void setProperty(String key, String value) { - put(key, value); + Object object = super.get(key); + if (object != null && !(object instanceof ArrayList)) { + ArrayList arrayList = new ArrayList(); + arrayList.add(object); + arrayList.add(value); + put(key, arrayList); + } else if (object instanceof ArrayList) { + ((ArrayList) object).add(value); + } else { + put(key, value); + } } @@ -83,6 +130,74 @@ return fIterator.next(); } + } + + /* + * (non-Javadoc) + * + * @see org.apache.nutch.io.Writable#write(java.io.DataOutput) + */ + public final void write(DataOutput out) throws IOException { + out.writeInt(keySet().size()); + Iterator iterator = keySet().iterator(); + String key; + String[] properties; + while (iterator.hasNext()) { + key = (String) iterator.next(); + UTF8.writeString(out, key); + properties = getProperties(key); + out.writeInt(properties.length); + for (int i = 0; i < properties.length; i++) { + UTF8.writeString(out, properties[i]); + } + } + + } + + /* + * (non-Javadoc) + * + * @see org.apache.nutch.io.Writable#readFields(java.io.DataInput) + */ + public final void readFields(DataInput in) throws IOException { + int keySize = in.readInt(); + String key; + for (int i = 0; i < keySize; i++) { + key = UTF8.readString(in); + int valueSize = in.readInt(); + for (int j = 0; j < valueSize; j++) { + setProperty(key, UTF8.readString(in)); + } + } + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (!(obj instanceof 
ContentProperties)) { + return false; + } + ContentProperties properties = (ContentProperties) obj; + Enumeration enumeration = properties.propertyNames(); + while (enumeration.hasMoreElements()) { + String key = (String) enumeration.nextElement(); + String[] values = properties.getProperties(key); + String[] myValues = getProperties(key); + if (values.length != myValues.length) { + return false; + } + for (int i = 0; i < values.length; i++) { + if (!values[i].equals(myValues[i])) { + return false; + } + + } + } + + return true; } }