Author: jnioche
Date: Tue Jan 4 19:30:38 2011
New Revision: 1055148
URL: http://svn.apache.org/viewvc?rev=1055148&view=rev
Log:
applied NUTCH-716 to 1.3
Modified:
nutch/branches/branch-1.3/CHANGES.txt
nutch/branches/branch-1.3/conf/schema.xml
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
Modified: nutch/branches/branch-1.3/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Tue Jan 4 19:30:38 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - Current Development
+* NUTCH-716 Make subcollection index field multivalued (Dmitry Lihachev via
jnioche)
+
* NUTCH-905 Configurable file protocol parent directory crawling (Thorsten
Scherler, mattmann, ab)
* NUTCH-787 ScoringFilters should not override the injected score (jnioche)
Modified: nutch/branches/branch-1.3/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/schema.xml?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/schema.xml (original)
+++ nutch/branches/branch-1.3/conf/schema.xml Tue Jan 4 19:30:38 2011
@@ -91,7 +91,7 @@
<!-- fields for subcollection plugin -->
<field name="subcollection" type="string" stored="true"
- indexed="true"/>
+ indexed="true" multiValued="true"/>
<!-- fields for feed plugin -->
<field name="author" type="string" stored="true" indexed="true"/>
Modified:
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
Tue Jan 4 19:30:38 2011
@@ -22,9 +22,12 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
@@ -89,7 +92,7 @@ public class CollectionManager extends C
.getElementsByTagName(Subcollection.TAG_COLLECTION);
if (LOG.isInfoEnabled()) {
- LOG.info("file has" + nodeList.getLength() + " elements");
+ LOG.info("file has " + nodeList.getLength() + " elements");
}
for (int i = 0; i < nodeList.getLength(); i++) {
@@ -115,7 +118,7 @@ public class CollectionManager extends C
impl=new CollectionManager(conf);
objectCache.setObject(key,impl);
} catch (Exception e) {
- throw new RuntimeException("Couldn't create CollectionManager",e);
+ throw new RuntimeException("Couldn't create CollectionManager", e);
}
}
return impl;
@@ -169,22 +172,21 @@ public class CollectionManager extends C
* The url to test against Collections
* @return Space delimited string of collection names url is part of
*/
- public String getSubCollections(final String url) {
- StringBuilder collections = new StringBuilder();
+ public List<String> getSubCollections(final String url) {
+ List<String> collections = new ArrayList<String>();
final Iterator iterator = collectionMap.values().iterator();
while (iterator.hasNext()) {
final Subcollection subCol = (Subcollection) iterator.next();
if (subCol.filter(url) != null) {
- if (collections.length() > 0) {
- collections.append(' ');
- }
- collections.append(subCol.name);
+ collections.add(subCol.name);
}
}
- if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
+ }
- return collections.toString();
+ return collections;
}
/**
Modified:
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1055148&r1=1055147&r2=1055148&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
Tue Jan 4 19:30:38 2011
@@ -62,8 +62,9 @@ public class SubcollectionIndexingFilter
* @param url
*/
private void addSubCollectionField(NutchDocument doc, String url) {
- String collname =
CollectionManager.getCollectionManager(getConf()).getSubCollections(url);
- doc.add(FIELD_NAME, collname);
+ for (String collname:
CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+ doc.add(FIELD_NAME, collname);
+ }
}
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {