Author: jnioche
Date: Tue Jan  4 19:30:38 2011
New Revision: 1055148

URL: http://svn.apache.org/viewvc?rev=1055148&view=rev
Log:
applied NUTCH-716 to 1.3

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/conf/schema.xml
    
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
    
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Tue Jan  4 19:30:38 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-716 Make subcollection index field multivalued (Dmitry Lihachev via 
jnioche)
+
 * NUTCH-905 Configurable file protocol parent directory crawling (Thorsten 
Scherler, mattmann, ab)
 
 * NUTCH-787 ScoringFilters should not override the injected score (jnioche)

Modified: nutch/branches/branch-1.3/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/schema.xml?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/schema.xml (original)
+++ nutch/branches/branch-1.3/conf/schema.xml Tue Jan  4 19:30:38 2011
@@ -91,7 +91,7 @@
 
         <!-- fields for subcollection plugin -->
         <field name="subcollection" type="string" stored="true"
-            indexed="true"/>
+            indexed="true" multiValued="true"/>
 
         <!-- fields for feed plugin -->
         <field name="author" type="string" stored="true" indexed="true"/>

Modified: 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
 (original)
+++ 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
 Tue Jan  4 19:30:38 2011
@@ -22,9 +22,12 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.logging.Log;
@@ -89,7 +92,7 @@ public class CollectionManager extends C
           .getElementsByTagName(Subcollection.TAG_COLLECTION);
 
       if (LOG.isInfoEnabled()) {
-        LOG.info("file has" + nodeList.getLength() + " elements");
+        LOG.info("file has " + nodeList.getLength() + " elements");
       }
       
       for (int i = 0; i < nodeList.getLength(); i++) {
@@ -115,7 +118,7 @@ public class CollectionManager extends C
         impl=new CollectionManager(conf);
         objectCache.setObject(key,impl);
       } catch (Exception e) {
-        throw new RuntimeException("Couldn't create CollectionManager",e);
+        throw new RuntimeException("Couldn't create CollectionManager", e);
       }
     }
     return impl;
@@ -169,22 +172,21 @@ public class CollectionManager extends C
    *          The url to test against Collections
    * @return Space delimited string of collection names url is part of
    */
-  public String getSubCollections(final String url) {
-    StringBuilder collections = new StringBuilder();
+  public List<String> getSubCollections(final String url) {
+    List<String> collections = new ArrayList<String>();
     final Iterator iterator = collectionMap.values().iterator();
 
     while (iterator.hasNext()) {
       final Subcollection subCol = (Subcollection) iterator.next();
       if (subCol.filter(url) != null) {
-        if (collections.length() > 0) {
-          collections.append(' ');
-        }
-        collections.append(subCol.name);
+        collections.add(subCol.name);
       }
     }
-    if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }
+    if (LOG.isTraceEnabled()) { 
+      LOG.trace("subcollections:" + Arrays.toString(collections.toArray())); 
+    }
     
-    return collections.toString();
+    return collections;
   }
 
   /**

Modified: 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
 (original)
+++ 
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
 Tue Jan  4 19:30:38 2011
@@ -62,8 +62,9 @@ public class SubcollectionIndexingFilter
    * @param url
    */
   private void addSubCollectionField(NutchDocument doc, String url) {
-    String collname = 
CollectionManager.getCollectionManager(getConf()).getSubCollections(url);
-    doc.add(FIELD_NAME, collname);
+    for (String collname: 
CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+      doc.add(FIELD_NAME, collname);
+    }
   }
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url, 
CrawlDatum datum, Inlinks inlinks) throws IndexingException {


Reply via email to