Author: jerome
Date: Sat Dec 17 02:06:31 2005
New Revision: 357334
URL: http://svn.apache.org/viewcvs?rev=357334&view=rev
Log:
NUTCH-3, ContentProperties can handle multivalued properties (S. Groschupf)
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Dec 17 02:06:31 2005
@@ -105,7 +105,7 @@
<property>
<name>http.redirect.max</name>
- <value>3</value>
+ <value>10</value>
<description>The maximum number of redirects the fetcher will follow when
trying to fetch a page.</description>
</property>
@@ -727,7 +727,8 @@
<property>
<name>parser.character.encoding.default</name>
- <value>windows-1252</value>
+ <!--value>windows-1252</value-->
+ <value>utf-8</value>
<description>The character encoding to fall back to when no other information
is available</description>
</property>
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec
17 02:06:31 2005
@@ -77,11 +77,8 @@
contentType = UTF8.readString(in); // read contentType
- int propertyCount = in.readInt(); // read metadata
metadata = new ContentProperties();
- for (int i = 0; i < propertyCount; i++) {
- metadata.put(UTF8.readString(in), UTF8.readString(in));
- }
+ metadata.readFields(in); // read meta data
}
protected final void writeCompressed(DataOutput out) throws IOException {
@@ -95,13 +92,7 @@
UTF8.writeString(out, contentType); // write contentType
- out.writeInt(metadata.size()); // write metadata
- Iterator i = metadata.entrySet().iterator();
- while (i.hasNext()) {
- Map.Entry e = (Map.Entry)i.next();
- UTF8.writeString(out, (String)e.getKey());
- UTF8.writeString(out, (String)e.getValue());
- }
+ metadata.write(out); // write metadata
}
public static Content read(DataInput in) throws IOException {
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
---
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
(original)
+++
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
Sat Dec 17 02:06:31 2005
@@ -16,15 +16,22 @@
package org.apache.nutch.protocol;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Properties;
import java.util.TreeMap;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.io.Writable;
+
/**
- * case insensitive properties
+ * writable case insensitive properties
*/
-public class ContentProperties extends TreeMap {
+public class ContentProperties extends TreeMap implements Writable {
/**
* construct the TreeMap with a case insensitive comparator
@@ -51,6 +58,36 @@
return (String) get(key);
}
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.Map#get(java.lang.Object)
+ */
+ public Object get(Object arg0) {
+ Object object = super.get(arg0);
+ if (object != null && object instanceof ArrayList) {
+ ArrayList list = (ArrayList) object;
+ return list.get(list.size() - 1);
+ }
+ return object;
+ }
+
+ /**
+ * @param key
+ * @return the properties as a string array; if there is no such property we
+ * return an array with 0 entries
+ */
+ public String[] getProperties(String key) {
+ Object object = super.get(key);
+ if (object != null && !(object instanceof ArrayList)) {
+ return new String[] { (String) object };
+ } else if (object != null && object instanceof ArrayList) {
+ ArrayList list = (ArrayList) object;
+ return (String[]) list.toArray(new String[list.size()]);
+ }
+ return new String[0];
+ }
+
/**
* sets the key value tuple
*
@@ -58,7 +95,17 @@
* @param value
*/
public void setProperty(String key, String value) {
- put(key, value);
+ Object object = super.get(key);
+ if (object != null && !(object instanceof ArrayList)) {
+ ArrayList arrayList = new ArrayList();
+ arrayList.add(object);
+ arrayList.add(value);
+ put(key, arrayList);
+ } else if (object instanceof ArrayList) {
+ ((ArrayList) object).add(value);
+ } else {
+ put(key, value);
+ }
}
@@ -83,6 +130,74 @@
return fIterator.next();
}
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.nutch.io.Writable#write(java.io.DataOutput)
+ */
+ public final void write(DataOutput out) throws IOException {
+ out.writeInt(keySet().size());
+ Iterator iterator = keySet().iterator();
+ String key;
+ String[] properties;
+ while (iterator.hasNext()) {
+ key = (String) iterator.next();
+ UTF8.writeString(out, key);
+ properties = getProperties(key);
+ out.writeInt(properties.length);
+ for (int i = 0; i < properties.length; i++) {
+ UTF8.writeString(out, properties[i]);
+ }
+ }
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.nutch.io.Writable#readFields(java.io.DataInput)
+ */
+ public final void readFields(DataInput in) throws IOException {
+ int keySize = in.readInt();
+ String key;
+ for (int i = 0; i < keySize; i++) {
+ key = UTF8.readString(in);
+ int valueSize = in.readInt();
+ for (int j = 0; j < valueSize; j++) {
+ setProperty(key, UTF8.readString(in));
+ }
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ public boolean equals(Object obj) {
+ if (!(obj instanceof ContentProperties)) {
+ return false;
+ }
+ ContentProperties properties = (ContentProperties) obj;
+ Enumeration enumeration = properties.propertyNames();
+ while (enumeration.hasMoreElements()) {
+ String key = (String) enumeration.nextElement();
+ String[] values = properties.getProperties(key);
+ String[] myValues = getProperties(key);
+ if (values.length != myValues.length) {
+ return false;
+ }
+ for (int i = 0; i < values.length; i++) {
+ if (!values[i].equals(myValues[i])) {
+ return false;
+ }
+
+ }
+ }
+
+ return true;
}
}