Author: jerome
Date: Sat Dec 17 02:06:31 2005
New Revision: 357334

URL: http://svn.apache.org/viewcvs?rev=357334&view=rev
Log:
NUTCH-3, ContentProperties can handle multivalued properties (S. Groschupf)

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Dec 17 02:06:31 2005
@@ -105,7 +105,7 @@
 
 <property>
   <name>http.redirect.max</name>
-  <value>3</value>
+  <value>10</value>
   <description>The maximum number of redirects the fetcher will follow when
     trying to fetch a page.</description>
 </property>
@@ -727,7 +727,8 @@
 
 <property>
   <name>parser.character.encoding.default</name>
-  <value>windows-1252</value>
+  <!--value>windows-1252</value-->
+  <value>utf-8</value>
   <description>The character encoding to fall back to when no other information
   is available</description>
 </property>

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 
17 02:06:31 2005
@@ -77,11 +77,8 @@
 
     contentType = UTF8.readString(in);            // read contentType
 
-    int propertyCount = in.readInt();             // read metadata
     metadata = new ContentProperties();
-    for (int i = 0; i < propertyCount; i++) {
-      metadata.put(UTF8.readString(in), UTF8.readString(in));
-    }
+    metadata.readFields(in);                    // read meta data
   }
 
   protected final void writeCompressed(DataOutput out) throws IOException {
@@ -95,13 +92,7 @@
 
     UTF8.writeString(out, contentType);           // write contentType
     
-    out.writeInt(metadata.size());                // write metadata
-    Iterator i = metadata.entrySet().iterator();
-    while (i.hasNext()) {
-      Map.Entry e = (Map.Entry)i.next();
-      UTF8.writeString(out, (String)e.getKey());
-      UTF8.writeString(out, (String)e.getValue());
-    }
+    metadata.write(out);                           // write metadata
   }
 
   public static Content read(DataInput in) throws IOException {

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=357334&r1=357333&r2=357334&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java 
Sat Dec 17 02:06:31 2005
@@ -16,15 +16,22 @@
 
 package org.apache.nutch.protocol;
 
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Properties;
 import java.util.TreeMap;
 
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.io.Writable;
+
 /**
- * case insensitive properties
+ * writable case insensitive properties
  */
-public class ContentProperties extends TreeMap {
+public class ContentProperties extends TreeMap implements Writable {
 
     /**
      * construct the TreeMap with a case insensitive comparator
@@ -51,6 +58,36 @@
         return (String) get(key);
     }
 
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.util.Map#get(java.lang.Object)
+     */
+    public Object get(Object arg0) {
+        Object object = super.get(arg0);
+        if (object != null && object instanceof ArrayList) {
+            ArrayList list = (ArrayList) object;
+            return list.get(list.size() - 1);
+        }
+        return object;
+    }
+
+    /**
+     * @param key
+     * @return the properties as a string array; if there is no such property we
+     *         return an array with 0 entries
+     */
+    public String[] getProperties(String key) {
+        Object object = super.get(key);
+        if (object != null && !(object instanceof ArrayList)) {
+            return new String[] { (String) object };
+        } else if (object != null && object instanceof ArrayList) {
+            ArrayList list = (ArrayList) object;
+            return (String[]) list.toArray(new String[list.size()]);
+        }
+        return new String[0];
+    }
+
     /**
      * sets the key value tuple
      * 
@@ -58,7 +95,17 @@
      * @param value
      */
     public void setProperty(String key, String value) {
-        put(key, value);
+        Object object = super.get(key);
+        if (object != null && !(object instanceof ArrayList)) {
+            ArrayList arrayList = new ArrayList();
+            arrayList.add(object);
+            arrayList.add(value);
+            put(key, arrayList);
+        } else if (object instanceof ArrayList) {
+            ((ArrayList) object).add(value);
+        } else {
+            put(key, value);
+        }
 
     }
 
@@ -83,6 +130,74 @@
             return fIterator.next();
         }
 
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see org.apache.nutch.io.Writable#write(java.io.DataOutput)
+     */
+    public final void write(DataOutput out) throws IOException {
+        out.writeInt(keySet().size());
+        Iterator iterator = keySet().iterator();
+        String key;
+        String[] properties;
+        while (iterator.hasNext()) {
+            key = (String) iterator.next();
+            UTF8.writeString(out, key);
+            properties = getProperties(key);
+            out.writeInt(properties.length);
+            for (int i = 0; i < properties.length; i++) {
+                UTF8.writeString(out, properties[i]);
+            }
+        }
+
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see org.apache.nutch.io.Writable#readFields(java.io.DataInput)
+     */
+    public final void readFields(DataInput in) throws IOException {
+        int keySize = in.readInt();
+        String key;
+        for (int i = 0; i < keySize; i++) {
+            key = UTF8.readString(in);
+            int valueSize = in.readInt();
+            for (int j = 0; j < valueSize; j++) {
+                setProperty(key, UTF8.readString(in));
+            }
+        }
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    public boolean equals(Object obj) {
+        if (!(obj instanceof ContentProperties)) {
+            return false;
+        }
+        ContentProperties properties = (ContentProperties) obj;
+        Enumeration enumeration = properties.propertyNames();
+        while (enumeration.hasMoreElements()) {
+            String key = (String) enumeration.nextElement();
+            String[] values = properties.getProperties(key);
+            String[] myValues = getProperties(key);
+            if (values.length != myValues.length) {
+                return false;
+            }
+            for (int i = 0; i < values.length; i++) {
+                if (!values[i].equals(myValues[i])) {
+                    return false;
+                }
+
+            }
+        }
+
+        return true;
     }
 
 }


Reply via email to